sample1 = Neurons1 sample2 = Neurons2 sample3 = Glia1 - Astrocytes (CD44+) sample4 = Glia2 - Radial Glia (CD44-)
In HPC I have run steps of scrnabox (custom pipeline in progress) 1. Cell Ranger for feature seq 2. Create Seurat Objects 3. Apply minimum filtering and calculate percent mitochondria.
I have technical 3 replicates with hashtag labels at this point I haven’t yet demultiplex the hashtags. The data here will be treated as one sample. I sorted three separate samples and pooled them together.
0 - NPC or early neurons 1 - immature excitatory neurons 2 - NPC or early neurons 3 - RG or Oligos 4- Dopaminergic neurons - possibly early 5 - NPC or early neurons 6 - Radial GLia
After predicting with the brain data I think using a higher cluster number will work better
Library of tissue cell types for up regulated genes per cluster 0 - hypothalmus, DA A13 1- neural plate, Radial Glia 2 - Neural stem 3 - stromal, astro OPC 4 - Neurons 5 - endothelial, pericyte 6 - maybe neurons maybe not
After predicting with the brain data Bhaduri Midbrain and Striatum - choose main cell types to label Tried the predictions with more clusters 0-9 res 1.2
Next Repeat everything for Neurons2
# explore filtering
seu <- Neurons2
seu
VlnPlot(seu, pt.size = 0.10, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
VlnPlot(seu, pt.size = 0.10, features = c("nFeature_RNA"), y.max = 2000)
VlnPlot(seu, pt.size = 0.10, features = c("nFeature_RNA"), y.max = 350)
VlnPlot(seu, pt.size = 0.10, features = c("nCount_RNA"), y.max = 2000)
# filter more cells
seu.ft <- subset(seu, subset = nFeature_RNA > 300 & nCount_RNA > 500 & nCount_RNA < 10000)
seu.ft
# 17604 samples with 250 nFeature_RNA
# 9657 with nFeature 300 and nCOunt 500
Doublet finder
suppressMessages(require(DoubletFinder))
# filtering out MALAT1 and mitochondrial genes
seu.ft <- seu.ft[!grepl("MALAT1", rownames(seu)), ]
seu.ft <- seu.ft[!grepl("^MT-", rownames(seu.ft)), ]
# like in the tutorial I'm following MALAT1 is the top most expressed gene. The top genes are a lot of MT and Ribosomal genes
seu.ft[["percent.rb"]] <- PercentageFeatureSet(seu.ft, pattern = "^RP")
seu.d = NormalizeData(seu.ft)
seu.d = FindVariableFeatures(seu.d, verbose = F)
seu.d = ScaleData(seu.d, vars.to.regress = c("nFeature_RNA", "percent.mt"),
verbose = F)
seu.d = RunPCA(seu.d, verbose = F, npcs = 30)
seu.d = RunUMAP(seu.d, dims = 1:10, verbose = F)
nExp <- round(ncol(seu.d) * 0.08) # expect more doublets because there is a lot more cells
seu.d <- doubletFinder_v3(seu.d, pN = 0.25, pK = 0.09, nExp = nExp, PCs = 1:10)
# name of the DF prediction can change, so extract the correct column name.
DF.name = colnames(seu.d@meta.data)[grepl("DF.classification", colnames(seu.d@meta.data))]
cowplot::plot_grid(ncol = 2, DimPlot(seu.d, group.by = "orig.ident") + NoAxes(),
DimPlot(seu.d, group.by = DF.name) + NoAxes())
VlnPlot(seu.d, features = "nFeature_RNA", group.by = DF.name, pt.size = 0.1)
Remove the doublet cells
seu.d <- seu.d[, seu.d@meta.data[, DF.name]== "Singlet"]
dim(seu.d)
dim(seu)
# 9657 cells pre filter
# 8884 cells after filtering
# note the percent doubles expected is close to the percent detected
Repeat workflow with doublet removed data and find clusters for
seu <- NormalizeData(seu.d, normalization.method = "LogNormalize", scale.factor = 10000)
seu <- FindVariableFeatures(seu, selection.method = "vst", nfeatures = 2000)
seu <- ScaleData(seu)
seu <- RunPCA(seu)
seu <- RunUMAP(seu, reduction = "pca", n.neighbors = 43, dims = 1:30)
DimPlot(seu, reduction = "umap")
seu.q <- FindNeighbors(seu, dims = 1:25, k.param = 43)
seu.q <- FindClusters(seu.q, resolution = c(0,0.2,0.4,0.6))
library(clustree)
clustree(seu.q)
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.2')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.4')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.6')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.1.2')
Label cell types using the label transfer
# SNCA and control midbrain organoids 165 days in culture
MBO <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AST23_BrainComm/MBOclusters_names29072021.rds")
# Midbrain AIW002 120 days in culture
AIWMBO <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AIWtrio120days/MOintegratedClusterK123res0.8.names_nov16_2021")
# Midbrain AIW002 60 days in culture
AIW60 <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AIWtrio60days/AWI002ParkinKOPinkKO60days_labels_14052022.rds")
# query
#seu.q <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/NeuronsFilteredSeu28092022.RDS")
#first predict with the MBO data
Idents(MBO) <- "cluster_labels"
DefaultAssay(MBO) <- "RNA"
# find the reference anchors
print("finding reference anchors")
anchors <- FindTransferAnchors(reference = MBO ,query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = MBO$cluster_labels)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$MBOAST23.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'MBOAST23.pred', label = TRUE)
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$predicted.id))
pr.t.lables <- as.data.frame(prop.table(table(seu.q$RNA_snn_res.0.2, seu.q$predicted.id)))
t.lables$Freq <- as.double(t.lables$Freq)
# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")
# clusters don't break up by the predicted cell types
############ another predictions now using the AIW organoids
Idents(AIWMBO) <- "res08names"
DefaultAssay(AIWMBO) <- "RNA"
anchors <- FindTransferAnchors(reference = AIWMBO ,query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = AIWMBO$res08names)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$MBOAIW.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'MBOAIW.pred', label = TRUE)
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$predicted.id))
pr.t.lables <- as.data.frame(prop.table(table(seu.q$RNA_snn_res.0.2, seu.q$predicted.id)))
t.lables$Freq <- as.double(t.lables$Freq)
# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")
# the predicted cell types make more sense from the AIW002 organoid
# now predict with the AIW002 60 days organoid
Idents(AIW60) <- "cluster.ids"
DefaultAssay(AIW60) <- "RNA"
anchors <- FindTransferAnchors(reference = AIW60, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = AIW60$cluster.ids)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$AIW60.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'AIW60.pred', label = TRUE)
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$predicted.id))
pr.t.lables <- as.data.frame(prop.table(table(seu.q$RNA_snn_res.0.2, seu.q$predicted.id)))
t.lables$Freq <- as.double(t.lables$Freq)
# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")
# save ojbect with predicitons
saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Neurons2PredictionsSeu30092022.RDS")
Predict from Brain Bhahani
seu.q <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Neurons2LabelsSeu30092022.RDS")
# read in the reference dataset
# from Bhaduri midbrain and striatum
seu.r <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PublicData/Bhaduri_wholeBrain/Bhaduri_midbrain_striatum.RDS")
table(seu.r$cell_type)
astrocyte dividing endothelial interneuron microglia neuron
310 351 321 4 30 1596
oligodendrocyte radial glia
195 162
table(seu.r$cell_class)
dividing endothelial glia microglia neuron
351 321 667 30 1600
table(seu.r$cell_cluster)
Astrocyte_4 Astrocyte_5 Dividing_11 Dividing_2 Dividing_3
211 99 10 71 103
Dividing_4 Dividing_42 Dividing_5 Endo_11 Endo_13
38 1 128 28 1
Endo_15 Endo_4 Endo_5 Endo_7 Endo_8
10 144 14 93 31
GW14_Microglia_27 Interneuron_1 Interneuron_24 Interneuron_9 Microglia_10
4 1 2 1 9
Microglia_2 Microglia_8 Neuron_100 Neuron_2 Neuron_22
9 8 1 57 44
Neuron_37 Neuron_4 Neuron_5 Neuron_98 Oligo_11
104 249 1049 92 35
Oligo_2 Oligo_6 Oligo_8 RG_1 RG_37
58 83 19 80 1
RG_4 RG_45 RG_6 RG_7
22 33 25 1
Idents(seu.r) <- "cell_cluster"
# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
Performing PCA on the provided reference using 1841 features as input.
Projecting cell embeddings
Finding neighborhoods
Finding anchors
Found 1051 anchors
Filtering anchors
Retained 318 anchors
print("getting predictions")
[1] "getting predictions"
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_cluster)
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Predicting cell labels
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
Astrocyte_4 Dividing_3 Dividing_5 Endo_4 Endo_7 Endo_8 Neuron_2 Neuron_22
2105 24 20 987 346 4 32 1
Neuron_37 Neuron_4 Neuron_98 Oligo_11 Oligo_2 RG_1 RG_45 RG_6
3353 1070 866 2 5 6 1 62
Idents(seu.q) <- 'predicted.id'
seu.q$Bha.mid.stri.pred <- Idents(seu.q)
print(table(seu.q$Bha.mid.stri.pred))
Neuron_4 Neuron_37 Astrocyte_4 Neuron_98 Endo_4 Endo_7 RG_6 RG_1
1070 3353 2105 866 987 346 62 6
Dividing_3 Endo_8 Oligo_11 Neuron_2 Dividing_5 Oligo_2 Neuron_22 RG_45
24 4 2 32 20 5 1 1
DimPlot(seu.q, group.by = 'Bha.mid.stri.pred')

DimPlot(seu.q, group.by = 'subgroups')

# do the predictions differ with the main cell type groups instead of the cluster in the reference data?
Idents(seu.r) <- "cell_type"
# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
Performing PCA on the provided reference using 1841 features as input.
Projecting cell embeddings
Finding neighborhoods
Finding anchors
Found 1051 anchors
Filtering anchors
Retained 318 anchors
print("getting predictions")
[1] "getting predictions"
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_type)
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Predicting cell labels
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
astrocyte dividing endothelial neuron oligodendrocyte radial glia
1294 34 1386 5606 53 511
DimPlot(seu.q, group.by = 'predicted.id')

# good largest prediction is neurons.
See the top predictions for each cluster in Neurons2 res 06
pred.table <- merge(df.top.AST23, df.top.aiw60, by = 'I', all = TRUE)
pred.table <- merge(pred.table, df.top.aiw120, by = 'I')
pred.table <- merge(pred.table, df.top.Bha, by = 'I')
Warning in merge.data.frame(pred.table, df.top.Bha, by = "I") :
column names ‘Var1.x’, ‘Var2.x’, ‘Freq.x’, ‘Var1.y’, ‘Var2.y’, ‘Freq.y’ are duplicated in the result
pred.table
pred.table <- merge(df.top.AST23, df.top.aiw60, by = 'I', all = TRUE)
pred.table <- merge(pred.table, df.top.aiw120, by = 'I')
pred.table <- merge(pred.table, df.top.Bha, by = 'I')
Warning in merge.data.frame(pred.table, df.top.Bha, by = "I") :
column names ‘Var1.x’, ‘Var2.x’, ‘Freq.x’, ‘Var1.y’, ‘Var2.y’, ‘Freq.y’ are duplicated in the result
pred.table
What cell types are predicted across the 3 references
0 - Neurons early , NPC, neurons excitatory 1 - Neurons early, NPC 2 - Neurons early, NPC, neurons excitatory some DA neurons 3 - Oligo, RG, 4 - Excitatory neurons, NPC, early neurons 5 - DA neurons, early DA neurons 6 - neurons immature NPC 7 - DA neurons 8 - RG, oligo, OPC, NPC 9 - Radial Glia 10 - NPC, neurons, oligo 11 - NPC, neurons, oligo
Find cluster markers and see how those would annotate
Idents(seu.q) <- 'RNA_snn_res.0.6'
ClusterMarkers <- FindAllMarkers(seu.q, only.pos = TRUE)
top5 <- ClusterMarkers %>% group_by(cluster) %>% top_n(n=5, wt = avg_log2FC)
DoHeatmap(seu.q, features = top5$gene, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
write.csv(ClusterMarkers,"/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/Neurons1ClusterMarkers11.csv")
Cluster 0 has fewer markers. 2 and 5 have similar up reg markers 3 and 4 also overlap
Look at the cluster markers in cell type libraries for Neurons 2
library(enrichR)
db <- c('Allen_Brain_Atlas_up','Descartes_Cell_Types_and_Tissue_2021',
'CellMarker_Augmented_2021','Azimuth_Cell_Types_2021')
# enrichr(genes, databases = NULL)
# cluster 0
N1.c0 <- ClusterMarkers %>% filter(cluster == 0 & avg_log2FC > 0)
genes <- N1.c0$gene
N1.c0.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c0.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c0.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c0.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
N1.Er.genes.1 <- N1.c0.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1
N1.Er.genes.2 <- N1.c0.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2
N1.Er.genes.3 <- N1.c0.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3
# cluster 1
N1.c1 <- ClusterMarkers %>% filter(cluster == 1 & avg_log2FC > 0)
genes <- N1.c1$gene
N1.c1.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c1.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c1.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c1.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c1.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
N1.Er.genes.1 <- N1.c1.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1
N1.Er.genes.2 <- N1.c1.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2
N1.Er.genes.3 <- N1.c1.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3
N1.Er.genes.4 <- N1.c1.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4
# cluster 1; olfactory bulb, neural plate, maybe Radial Glia,
N1.c2 <- ClusterMarkers %>% filter(cluster == 2 & avg_log2FC > 0)
genes <- N1.c2$gene
N1.c2.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c2.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c2.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c2.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c2.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
N1.Er.genes.1 <- N1.c2.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1
N1.Er.genes.2 <- N1.c2.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2
N1.Er.genes.3 <- N1.c2.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3
N1.Er.genes.4 <- N1.c2.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4
# cluster 3
N1.c3 <- ClusterMarkers %>% filter(cluster == 3 & avg_log2FC > 0)
genes <- N1.c3$gene
N1.c3.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c3.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c3.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c3.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c3.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
N1.Er.genes.1 <- N1.c3.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1
N1.Er.genes.2 <- N1.c3.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2
N1.Er.genes.3 <- N1.c3.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3
N1.Er.genes.4 <- N1.c3.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4
# cluster 4
N1.c4 <- ClusterMarkers %>% filter(cluster == 4 & avg_log2FC > 0)
genes <- N1.c4$gene
N1.c4.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c4.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c4.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c4.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c4.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
N1.Er.genes.1 <- N1.c4.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1
N1.Er.genes.2 <- N1.c4.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2
N1.Er.genes.3 <- N1.c4.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3
N1.Er.genes.4 <- N1.c4.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4
# cluster 5
N1.c5 <- ClusterMarkers %>% filter(cluster == 5 & avg_log2FC > 0)
genes <- N1.c5$gene
N1.c5.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c5.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c5.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c5.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c5.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
N1.Er.genes.1 <- N1.c5.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1
N1.Er.genes.2 <- N1.c5.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2
N1.Er.genes.3 <- N1.c5.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3
N1.Er.genes.4 <- N1.c5.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4
# cluster 6
N1.c6 <- ClusterMarkers %>% filter(cluster == 6 & avg_log2FC > 0)
genes <- N1.c6$gene
N1.c6.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c6.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c6.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c6.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c6.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
N1.Er.genes.1 <- N1.c6.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1
N1.Er.genes.2 <- N1.c6.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2
N1.Er.genes.3 <- N1.c6.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3
N1.Er.genes.4 <- N1.c6.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4
# other clusters - change the cluster number
N1.c6 <- ClusterMarkers %>% filter(cluster == 11 & avg_log2FC > 0)
genes <- N1.c6$gene
N1.c6.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c6.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c6.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c6.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c6.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
N1.Er.genes.1 <- N1.c6.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1
N1.Er.genes.2 <- N1.c6.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2
N1.Er.genes.3 <- N1.c6.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3
N1.Er.genes.4 <- N1.c6.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4
cluster 0 - astrocyte, radial glia, microglia, striatum - very few genes in these terms cluster 1 - adrenal, thalmus, endothelial, astrocytes, neurons clusters 2 - neural plate, stratum, neurons, NPC, stem, astro,neuroendocrine cluster 3 - brain molecular layer, endothelial, astrocyte embryonic, cluster 4 - DG, striatum CA3, GABAergic neurons cluster 5 - DG, neurons, Glutamatergic neurons cluster 6 - neurons, astrocyte, microglia, GABAneurons cluster 7 - neurons, glut and gaba cluster 8 - astrocyte, endothelial, pericyte cluster 9 - epithelial, embryonic astrocytes, GABA neurons cluster 10 - epithelial cluster 11 - endothelial, immune cells T cells
Expression of markers genes in Neurons2
feature_list = c("MKI67","SOX2","POU5F1","DLX2","PAX6","SOX9","HES1","NES","RBFOX3","MAP2","NCAM1","CD24","GRIA2","GRIN2B","GABBR1","GAD1","GAD2","GABRA1","GABRB2","TH","ALDH1A1","LMX1B","NR4A2","CORIN","CALB1","KCNJ6","CXCR4","ITGA6","SLC1A3","CD44","AQP4","S100B", "PDGFRA","OLIG2","MBP","CLDN11","VIM","VCAM1")
DoHeatmap(seu.q, features = feature_list, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = feature_list) +RotatedAxis()
PD_poulin = c("TH","SLC6A3","SLC18A2","SOX6","NDNF","SNCG","ALDH1A1","CALB1","TACR2","SLC17A6","SLC32A1","OTX2","GRP","LPL","CCK","VIP")
DoHeatmap(seu.q, features = PD_poulin, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = PD_poulin)+RotatedAxis()
ealryNeur = c("DCX","NEUROD1","TBR1")
proliferation = c("PCNA","MKI67")
neuralstem = c("SOX2","NES","PAX6","MASH1")
feature_list <- c("DCX","NEUROD1","TBR1","PCNA","MKI67","SOX2","NES","PAX6","MASH1")
DoHeatmap(seu.q, features = feature_list, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = feature_list)+RotatedAxis()
# no proliferation marker expression PCNA or MKI67
# cluster 4 DA neurons - shows early neuron marker and low PAX 4
# cluster 3 has higher SOX2 - neuroblast marker / NPC marker
mat_neuron = c("RBFOX3","SYP","DLG45","VAMP1","VAMP2","TUBB3","SYT1","BSN","HOMER1","SLC17A6")
# NeuN is FOX3 - RBFOX3
# PSD95 also SP-90 or DLG4
# VGLUT2 is SLC17A6
DoHeatmap(seu.q, features = mat_neuron, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
# cluster 4 also show mature neuron markers
DotPlot(seu.q, features = mat_neuron)+RotatedAxis()
# excitatory neuron markers
ex = c("GRIA2","GRIA1","GRIA4","GRIN1","GRIN2B","GRIN2A","GRIN3A","GRIN3","GRIP1","CAMK2A")
DoHeatmap(seu.q, features = ex, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = ex)+RotatedAxis()
# inhibitory neuron markers
inh = c("GAD1","GAD2", "GAT1","PVALB","GABR2","GABR1","GBRR1","GABRB2","GABRB1","GABRB3","GABRA6","GABRA1","GABRA4","TRAK2")
DoHeatmap(seu.q, features = inh, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = inh)+RotatedAxis()
# cluster 4 is more excitatory than inhbitory but neither marker set has much expression
### glia markers
microglia = c("PTPRC","AIF1","ADGRE1") # ADGRE1 is a microglia marker F4/80, CD45 is PTPRC, gene name IBA1 is AIF1
astolgNPCpromicro = c("GFAP","S100B","SLC1A2","MBP","SOX10","SPP1","DCX","NEUROD1","TBR1","PCNA","MKI67","PTPRC","AIF1","ADGRE1")
# note GLT1 is EAAT2 which is SLC1A2 glutatmate transporter
# epithelial
epi = c("HES1","HES5","SOX2","SOX10","NES","CDH1","NOTCH1") # e-cadherin is CDH1
DoHeatmap(seu.q, features = astolgNPCpromicro, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = astolgNPCpromicro, group.by = 'RNA_snn_res.0.6')+RotatedAxis()
# cluster 4 is more excitatory than inhbitory but neither marker set has much expression
DoHeatmap(seu.q, features = epi, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = epi, group.by = 'RNA_snn_res.0.6')+RotatedAxis()
# also add Radial glia marker overlap with Glia and Neurons
features <- c("PTPRC","AIF1","ADGRE1", "VIM", "TNC","PTPRZ1","FAM107A","HOPX","LIFR",
"ITGB5","IL6ST")
DoHeatmap(seu.q, features = features, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = features, group.by = 'RNA_snn_res.0.6')+RotatedAxis()
Label the Neuron2 FACS population
Idents(seu.q) <- 'RNA_snn_res.0.6'
cluster.ids <- c("Neurons1","immatureNeurons1","Neurons2",
"Other","DAneurons1","DAneurons2","immatureNeurons2",
"DAneurons3","RG","immatureNeurons1","Epithelial","Endothelial")
unique(seu.q$RNA_snn_res.0.6)
names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$subgroups <- Idents(seu.q)
DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'subgroups', repel = TRUE)
# label again with just numbering
# then I'll find markers in pairs to distinguish groups.
Idents(seu.q) <- 'RNA_snn_res.0.6'
cluster.ids <- c("Neurons1","Neurons2","Neurons3",
"Other","DAneurons1","DAneurons2","Neurons4",
"DAneurons3","RG","Neurons2","Epithelial","Endothelial")
unique(seu.q$RNA_snn_res.0.6)
names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$number.groups <- Idents(seu.q)
DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'number.groups', repel = TRUE)
saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Neurons2LabelsSeu30092022.RDS")
Find markers in pairs to go back and classify the subgroups. Will need to return to this for Neurons1 FACS
Neurons 5 and Neurons2 had similar markers and were merged Subset again
# faster to make a subset objects of only neurons and use find all markers
neuron.sub <- subset(seu.q, idents = c("Neurons1","Neurons2","Neurons3",
"Neurons4"))
neuron.sub.markers <- FindAllMarkers(neuron.sub)
top5 <- neuron.sub.markers %>% group_by(cluster) %>% top_n(n=5, wt = avg_log2FC)
DoHeatmap(neuron.sub, features = top5$gene, size=3, angle =90, group.bar.height = 0.02)
DotPlot(neuron.sub, features = top5$gene) + RotatedAxis()
# neurons 5 was joined to Neurons 2
# cluster markers done again
# Neurons1 : MGP (targeting neural projections BMP signaling), HPD (excitatory and inibitory, could be cell adhesion or apoptosis), MSX1 (transcription factor BMP signaling, midbrain marker, developmental), CYP1B1 (redox homeostatis)
# MGP, HPD, MSX1, CYP1B1
# there isn't a good proportion of cells expression any of the markers, Neurons2 has the best amount
# Neurons2: TFP12 serine protease melatonin conversion, PTN (cytokine signaling), LYgH (inhances nAChRs), S100A10 (modulates serotonin receptor), IFI27 (antiviral activity)
#TFP12,PTN, LY6H, S100A10, IFI27
# Neurons3: SOX4, ASCL1 (neurogenesis),
# SOX4 is a marker of 3 but has high expression in 4 as well
# Neurons4: PCAT4 (not noted in neurons), TPH1 (5HT synthesis), GK5 (neuronal maintainance), SST (somatostatin, GABA spike regulation), TTR (neural protective in AD)
# PCAT4, TPH1, GK5, SST, TTR
# markers to use
# Neurons1: MSX1, CYP1B1
# Neurons2: LY6H, S100A10
# Neurons3: SOX4, ASCL1 (Neurogenesis) Immature
# Neurons4: GK5, SST More mature
# Maybe neurons 1 and 2 could be merged
# lets see how the markers would look
neurons.1and2 <- FindMarkers(neuron.sub, ident.1 = c("Neurons1","Neurons2"),
ident.2 = c("Neurons3","Neurons4"))
top10 <- neurons.1and2 %>% top_n(n=10, wt = avg_log2FC)
ft.up <- rownames(top10) # up in Neurons1 and 3
top10 <- neurons.1and2 %>% top_n(n=-10, wt = avg_log2FC)
ft.down <- rownames(top10)
features <- c(ft.up,ft.down)
DoHeatmap(neuron.sub, features = features, size=3, angle =90, group.bar.height = 0.02)
DotPlot(neuron.sub, features = features) + RotatedAxis()
# all the markers were up regulated in neurons2 and not really neurons1
# I'll keep them separated
Use subgrouping and find cluster markers to look at neuronal subtypes.
# faster to make a subset objects of only neurons and use find all markers
neuron.sub <- subset(seu.q, idents = c("DAneurons1","DAneurons2","DAneurons3"))
neuron.sub.markers <- FindAllMarkers(neuron.sub)
top5 <- neuron.sub.markers %>% group_by(cluster) %>% top_n(n=5, wt = avg_log2FC)
DoHeatmap(neuron.sub, features = top5$gene, size=3, angle =90, group.bar.height = 0.02)
DotPlot(neuron.sub, features = top5$gene) + RotatedAxis()
# markers are much clearer for the DA neuron subgroups
# DA neurons 1: WIF1, CYP1B1, IGFBP3, HPD, WFIKKN2
# WIF1 (secreted WNT inhibitor, promotes regeneration), CYP1B1 (redox homeostatis), IGFBP3 (prolactin secretion regulation hypothalmus), HPD (neuro protective), WFIKKN2 (Receptor for TNC)
# DA neurons2: CDH7, RUNX1T1, ASCL1, DLK1, MEG3
# CDH7 (neuro circuitry development, SEMA), RUNX1T1 (neuronal differentiation), ASCL1 (neuronal differentiation), DLK1 (neural differentation), MEG3 (neural homeostatis)
# DA neurons3: PCAT4, NEUROD1, NCKAP5, GK5, SST
# PCAT4 (dendritic growth), NEUROD1 (neural differentation), NCKAP5 (Excitory neurons), GK5 (TH ), SST (regualates spike times)
# DA neurons1: CYP1B1, IGFBP3
# DA neurons2: RUNX1T1, ASCL1
# DA neurons3: NEUROD1, NCKAP5
Name the Neurons2 FACS population with the Neuron subtype latter.
Idents(seu.q) <- 'RNA_snn_res.0.6'
cluster.ids <- c("Neurons-MSX1","Neurons-LY6H","Neurons-ASCL1",
"Other","DAneurons-CYP1B1","DAneurons-ASCL1","Neurons-GK5",
"DAneurons-NEUROD1","RG","Neurons-LY6H","Epithelial","Endothelial")
unique(seu.q$RNA_snn_res.0.6)
names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$cellsubgroups <- Idents(seu.q)
DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'cellsubgroups', repel = TRUE)
preduction summary Cell_Types
0 neurons Neurons 1 NPC/oligo/astro/neurons/RG Neurons 2 neurons Neurons 3 NPC/RG/Astro Neurons 4 RG/endo RG 5 DA neurons Neurons 6 Neurons Neurons 7 DA neurons Neurons 8 RG/endo Endothelial 9 RG/astro Astro 10 opc/npc/astro/neurons Neurons 11 neurons/dividing/RG Neurons
Clustering higher res res 1.2 0 Neurons 1 Neurons 2 Neurons 3 Astro/RG/Neurons 4 RG/Astro/Neurons 5 RG/Neurons/NPC 6 RG/NPC/Endo 7 Neurons DA 8 Neurons 9 Neurons DA 10 Neurons DA 11 RG/Endo 12 RG/Astro 13 RG/Astro/Neurons 14 Neurons/RG
Idents(seu.q) <- 'RNA_snn_res.1.2'
# highlight the DA neurons
cluster.ids <- c("Neurons","Neurons","NPC",
"Neurons","Neurons","Neurons",
"Neurons DA","Neurons","Neurons DA",
"Neurons DA", "Neurons DA","Endothelial",
"RG/Astro","RG/Astro/Neurons","Neurons/RG")
cluster.ids <- c("Neurons","Neurons","Neurons",
"Neurons","Neurons","NPC",
"Neurons","Neurons","Neurons",
"Neurons", "Neurons","Endothelial",
"Astrocytes","Radial Glia","Radial Glia")
names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$Cell_Types <- Idents(seu.q)
DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'Cell_Types', repel = TRUE)

#DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'RNA_snn_res.1.2', repel = TRUE)
#clustree(seu.q)
# save the Neurons2 with labels
saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Neurons2LabelsSeu30092022.RDS")
FACS population Glia1 (should be astrocytes)
# explore filtering
seu <- Glia1
seu
#
VlnPlot(seu, pt.size = 0.10, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
VlnPlot(seu, pt.size = 0.10, features = c("nFeature_RNA"), y.max = 1000)
VlnPlot(seu, pt.size = 0.10, features = c("nFeature_RNA"), y.max = 500)
VlnPlot(seu, pt.size = 0.10, features = c("nCount_RNA"), y.max = 2000)
# filter more cells
seu.ft <- subset(seu, subset = nFeature_RNA > 300 & nCount_RNA > 500 & nCount_RNA < 10000)
seu.ft
# still a lot of cells 47295
# will likely remove a lot more with the doublet finder
saveRDS(seu.ft, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia1AstroSeu01102022.RDS")
seu.ft <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia1AstroSeu01102022.RDS")
seu.ft <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia1AstroSeu01102022.RDS")
Doublet finder
suppressMessages(require(DoubletFinder))
# filtering out MALAT1 and mitochondrial genes
seu.ft <- seu.ft[!grepl("MALAT1", rownames(seu.ft)), ]
seu.ft <- seu.ft[!grepl("^MT-", rownames(seu.ft)), ]
# like in the tutorial I'm following MALAT1 is the top most expressed gene. The top genes are a lot of MT and Ribosomal genes
seu.ft[["percent.rb"]] <- PercentageFeatureSet(seu.ft, pattern = "^RP")
# down sample there are too many cells to run doublet finder
seu.sub <- subset(seu.ft, downsample = 20000)
seu.d = NormalizeData(seu.sub)
seu.d = FindVariableFeatures(seu.d, verbose = F)
seu.d = ScaleData(seu.d, vars.to.regress = c("nFeature_RNA", "percent.mt"),
verbose = F)
seu.d = RunPCA(seu.d, verbose = F, npcs = 15)
seu.d = RunUMAP(seu.d, dims = 1:10, verbose = F)
nExp <- round(ncol(seu.d) * 0.15) # expect more doublets because there is a lot more cells
seu.d <- doubletFinder_v3(seu.d, pN = 0.25, pK = 0.09, nExp = nExp, PCs = 1:10)
# the memory limit is reached here - I could run on compute canada
# For now I'll downsample
# this works
# name of the DF prediction can change, so extract the correct column name.
DF.name = colnames(seu.d@meta.data)[grepl("DF.classification", colnames(seu.d@meta.data))]
cowplot::plot_grid(ncol = 2, DimPlot(seu.d, group.by = "orig.ident") + NoAxes(),
DimPlot(seu.d, group.by = DF.name) + NoAxes())
VlnPlot(seu.d, features = "nFeature_RNA", group.by = DF.name, pt.size = 0.1)
Remove the doublet cells
seu.d <- seu.d[, seu.d@meta.data[, DF.name]== "Singlet"]
dim(seu.d)
dim(seu.sub)
# 20000 pre filter
# creates the expected percentage
Repeat workflow with doublet removed data and find clusters for
seu <- NormalizeData(seu.d, normalization.method = "LogNormalize", scale.factor = 10000)
seu <- FindVariableFeatures(seu, selection.method = "vst", nfeatures = 2000)
seu <- ScaleData(seu)
seu <- RunPCA(seu)
seu <- RunUMAP(seu, reduction = "pca", n.neighbors = 43, dims = 1:30)
DimPlot(seu, reduction = "umap")
seu.q <- FindNeighbors(seu, dims = 1:25, k.param = 43)
seu.q <- FindClusters(seu.q, resolution = c(0,0.2,0.4,0.6))
seu.q <- FindClusters(seu.q, resolution = c(0,0.05,0.1,0.8))
library(clustree)
clustree(seu.q)
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.05')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.1')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.2')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.4')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.6')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.8')
Look at some expression markers in a feature plot
# genes reported up in Astrocytes
FeaturePlot(seu.q, features = c("GFAP","S100B","AQP4","SLC1A3","GJA1",
"APOE","TEAD1","GSTA4","SOX9",
"VIM","HMG20A","ALDH1L1"))
# almost no GFAP expression and lots of S100B everywhere
Predict cell types
# SNCA and control midbrain organoids 165 days in culture
MBO <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AST23_BrainComm/MBOclusters_names29072021.rds")
# Midbrain AIW002 120 days in culture
AIWMBO <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AIWtrio120days/MOintegratedClusterK123res0.8.names_nov16_2021")
# Midbrain AIW002 60 days in culture
AIW60 <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AIWtrio60days/AWI002ParkinKOPinkKO60days_labels_14052022.rds")
#first predict with the MBO data
Idents(MBO) <- "cluster_labels"
DefaultAssay(MBO) <- "RNA"
# find the reference anchors
print("finding reference anchors")
anchors <- FindTransferAnchors(reference = MBO ,query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = MBO$cluster_labels)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$MBOAST23.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'MBOAST23.pred', label = TRUE)
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.8, seu.q$MBOAST23.pred))
t.lables$Freq <- as.double(t.lables$Freq)
# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")
# clusters don't break up by the predicted cell types
############ another predictions now using the AIW organoids
Idents(AIWMBO) <- "res08names"
DefaultAssay(AIWMBO) <- "RNA"
anchors <- FindTransferAnchors(reference = AIWMBO ,query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = AIWMBO$res08names)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$MBOAIW.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'MBOAIW.pred', label = TRUE)
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.8, seu.q$MBOAIW.pred))
t.lables$Freq <- as.double(t.lables$Freq)
# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")
# the predicted cell types make more sense from the AIW002 organoid
# now predict with the AIW002 60 days organoid
Idents(AIW60) <- "cluster.ids"
DefaultAssay(AIW60) <- "RNA"
anchors <- FindTransferAnchors(reference = AIW60, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = AIW60$cluster.ids)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$AIW60.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'AIW60.pred', label = TRUE)
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.8, seu.q$AIW60.pred))
t.lables$Freq <- as.double(t.lables$Freq)
# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")
# save ojbect with predicitons
saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia1PredictionsSeu01102022.RDS")
Predict with the brain scRNAseq
seu.r <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PublicData/Bhaduri_wholeBrain/Bhaduri_midbrain_striatum.RDS")
Idents(seu.r) <- "cell_cluster"
# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
Performing PCA on the provided reference using 1841 features as input.
Projecting cell embeddings
Finding neighborhoods
Finding anchors
Found 752 anchors
Filtering anchors
Retained 108 anchors
print("getting predictions")
[1] "getting predictions"
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_cluster)
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Predicting cell labels
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
Astrocyte_4 Endo_4 Neuron_98 RG_1
7791 3019 2 9188
Idents(seu.q) <- 'predicted.id'
seu.q$Bha.mid.stri.pred <- Idents(seu.q)
print(table(seu.q$Bha.mid.stri.pred))
Astrocyte_4 Endo_4 RG_1 Neuron_98
7791 3019 9188 2
DimPlot(seu.q, group.by = 'Bha.mid.stri.pred')

DimPlot(seu.q, group.by = 'subgroups')

# do the predictions differ with the main cell type groups instead of the cluster in the reference data?
Idents(seu.r) <- "cell_type"
# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
Performing PCA on the provided reference using 1841 features as input.
Projecting cell embeddings
Finding neighborhoods
Finding anchors
Found 752 anchors
Filtering anchors
Retained 108 anchors
print("getting predictions")
[1] "getting predictions"
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_type)
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Predicting cell labels
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
astrocyte endothelial neuron radial glia
3747 7012 2152 7089
DimPlot(seu.q, group.by = 'predicted.id')

# AIW002 120 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$MBOAIW.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype.AIW120 <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.aiw120 <- top.pred.celltype.AIW120[order(top.pred.celltype.AIW120$Var1,-top.pred.celltype.AIW120$Freq),]
row.names(df.top.aiw120) <- NULL
df.top.aiw120$I <- row.names(df.top.aiw120)
# AIW002 60 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$AIW60.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype.AIW60 <-as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.aiw60 <- top.pred.celltype.AIW60[order(top.pred.celltype.AIW60$Var1,-top.pred.celltype.AIW60$Freq),]
row.names(df.top.aiw60) <- NULL
df.top.aiw60$I <- row.names(df.top.aiw60)
# AST23 165 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$MBOAST23.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype.AST23 <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.AST23 <- top.pred.celltype.AST23[order(top.pred.celltype.AST23$Var1,-top.pred.celltype.AST23$Freq),]
row.names(df.top.AST23) <- NULL
df.top.AST23$I <- row.names(df.top.AST23)
### add in the prediction from brain data Bhaduri midbrain and striatum
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$Bha.mid.stri.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype.Bha <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.Bha <- top.pred.celltype.Bha[order(top.pred.celltype.Bha$Var1,-top.pred.celltype.Bha$Freq),]
row.names(df.top.Bha) <- NULL
df.top.Bha$I <- row.names(df.top.Bha)
## these are calculated below
### add in the prediction from brain whole brain data Bhaduri midbrain down sampled
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$Bha))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype.Bha1 <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.Bha1 <- top.pred.celltype.Bha1[order(top.pred.celltype.Bha$Var1,-top.pred.celltype.Bha$Freq),]
row.names(df.top.Bha1) <- NULL
df.top.Bha1$I <- row.names(df.top.Bha1)
pred.table <- merge(df.top.AST23, df.top.aiw60, by = 'I', all = TRUE)
pred.table <- merge(pred.table, df.top.aiw120, by = 'I')
pred.table <- merge(pred.table, df.top.Bha, by = 'I')
Warning in merge.data.frame(pred.table, df.top.Bha, by = "I") :
column names ‘Var1.x’, ‘Var2.x’, ‘Freq.x’, ‘Var1.y’, ‘Var2.y’, ‘Freq.y’ are duplicated in the result
pred.table <- merge(pred.table, df.top.Bha1, by = 'I')
Warning in merge.data.frame(pred.table, df.top.Bha1, by = "I") :
column names ‘Var1.x’, ‘Var2.x’, ‘Freq.x’, ‘Var1.y’, ‘Var2.y’, ‘Freq.y’ are duplicated in the result
pred.table
NA
These predictions are not good. There are several astrocyte markers by expression levels. Everything is predicted as Radial glia or oligo dendrocytes
Try to predict with the whole brain and see if it’s different
Idents(seu.q) <- 'predicted.id'
seu.q$Bha <- Idents(seu.q)
print(table(seu.q$Bha))
Oligo_9 Endo_1 GW20_Astrocyte_39 Astrocyte_4 Endo_2 Astrocyte_1
692 415 1312 8401 4497 2308
Astrocyte_2 Endo_3 Neuron_37 Dividing_5 IPC_34 IPC_29
159 1621 521 24 35 1
GW18_2_45Oligo Dividing_1 Oligo_8 GW19_2_45Outlier Oligo_6
2 5 4 2 1
DimPlot(seu.q, group.by = 'Bha')

DimPlot(seu.q, group.by = 'subgroups')

Try to predict with the astrocyte Kamath data
astro.ref <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/Macosko_Data/PD_astro.Rds")
# need to make PCA and UMAP
astro.ref <- NormalizeData(astro.ref)
astro.ref <- FindVariableFeatures(astro.ref, selection.method = "vst", nfeatures = 2000)
astro.ref <- ScaleData(astro.ref)
astro.ref <- RunPCA(astro.ref)
astro.ref <- RunUMAP(astro.ref, reduction = "pca", n.neighbors = 205, dims = 1:25)
colnames(astro.ref@meta.data)
Idents(astro.ref) <- "Cell_Subtype"
DefaultAssay(astro.ref) <- "RNA"
# find the reference anchors
print("finding reference anchors")
anchors <- FindTransferAnchors(reference = astro.ref ,query = seu.q, dims = 1:20)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = astro.ref$Cell_Subtype, k.weight = 10)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$astro.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'astro.pred', label = TRUE)
table(seu.q$astro.pred)
seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, NA)
print(table(seu.q$predicted.id))
Idents(seu.q) <- 'predicted.id'
seu.q$astro.pred.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'astro.pred.thresh', label = TRUE)
table(seu.q$astro.pred.thresh)
# 19986 Astro_VIM_TNFSRF12A no threshold Astro_GLYATL2 14
# 8376 Astro_VIM_TNFSRF12A with 95% threshold
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$astro.pred.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()
top.pred.astro <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.astro <- top.pred.astro[order(top.pred.astro$Var1,-top.pred.astro$Freq),]
row.names(df.top.astro) <- NULL
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$astro.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()
top.pred.astro <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.astro <- top.pred.astro[order(top.pred.astro$Var1,-top.pred.astro$Freq),]
row.names(df.top.astro) <- NULL
Possible predicted in other clusters
clustree(seu.q)

Make the prediction table for high resolution Res 0.8 12 clusters
# AIW002 120 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.8, seu.q$MBOAIW.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype.AIW120 <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.aiw120 <- top.pred.celltype.AIW120[order(top.pred.celltype.AIW120$Var1,-top.pred.celltype.AIW120$Freq),]
row.names(df.top.aiw120) <- NULL
df.top.aiw120$I <- row.names(df.top.aiw120)
# AIW002 60 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.8, seu.q$AIW60.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype.AIW60 <-as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.aiw60 <- top.pred.celltype.AIW60[order(top.pred.celltype.AIW60$Var1,-top.pred.celltype.AIW60$Freq),]
row.names(df.top.aiw60) <- NULL
df.top.aiw60$I <- row.names(df.top.aiw60)
# AST23 165 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.8, seu.q$MBOAST23.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype.AST23 <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.AST23 <- top.pred.celltype.AST23[order(top.pred.celltype.AST23$Var1,-top.pred.celltype.AST23$Freq),]
row.names(df.top.AST23) <- NULL
df.top.AST23$I <- row.names(df.top.AST23)
### add in the prediction from brain data Bhaduri midbrain and striatum
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.8, seu.q$Bha.mid.stri.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype.Bha <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.Bha <- top.pred.celltype.Bha[order(top.pred.celltype.Bha$Var1,-top.pred.celltype.Bha$Freq),]
row.names(df.top.Bha) <- NULL
df.top.Bha$I <- row.names(df.top.Bha)
## these are calculated below
### add in the prediction from brain whole brain data Bhaduri midbrain down sampled
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.8, seu.q$Bha))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype.Bha1 <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.Bha1 <- top.pred.celltype.Bha1[order(top.pred.celltype.Bha$Var1,-top.pred.celltype.Bha$Freq),]
row.names(df.top.Bha1) <- NULL
df.top.Bha1$I <- row.names(df.top.Bha1)
pred.table <- merge(df.top.AST23, df.top.aiw60, by = 'I', all = TRUE)
pred.table <- merge(pred.table, df.top.aiw120, by = 'I')
pred.table <- merge(pred.table, df.top.Bha, by = 'I')
Warning in merge.data.frame(pred.table, df.top.Bha, by = "I") :
column names ‘Var1.x’, ‘Var2.x’, ‘Freq.x’, ‘Var1.y’, ‘Var2.y’, ‘Freq.y’ are duplicated in the result
pred.table <- merge(pred.table, df.top.Bha1, by = 'I')
Warning in merge.data.frame(pred.table, df.top.Bha1, by = "I") :
column names ‘Var1.x’, ‘Var2.x’, ‘Freq.x’, ‘Var1.y’, ‘Var2.y’, ‘Freq.y’ are duplicated in the result
pred.table
NA
Look at cluster markers
Idents(seu.q) <- 'RNA_snn_res.0.2'
ClusterMarkers <- FindAllMarkers(seu.q, only.pos = TRUE)
top5 <- ClusterMarkers %>% group_by(cluster) %>% top_n(n=5, wt = avg_log2FC)
DoHeatmap(seu.q, features = top5$gene, size=3, angle =90, group.bar.height = 0.02)
write.csv(ClusterMarkers,"/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/Glia1AstrocytesClusterMarkers_new.csv")
# for the res 0.6 the largers groups 0 and 1 don't have great markers and none of the markers are really very good.
# I'll rerun with res 0.2
unique(seu.q$RNA_snn_res.0.2)
# still not much better
Check cell type markers with EnrichR
library(enrichR)
setEnrichrSite("Enrichr") # Human genes
# list of all the databases
# libaries with cell types
db <- c('Allen_Brain_Atlas_up','Descartes_Cell_Types_and_Tissue_2021',
'CellMarker_Augmented_2021','Azimuth_Cell_Types_2021')
# enrichr(genes, databases = NULL)
#I'll run the clusters one at a time
N1.c0 <- ClusterMarkers %>% filter(cluster == 5 & avg_log2FC > 0)
genes <- N1.c0$gene
N1.c0.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c0.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c0.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c0.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c0.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
N1.Er.genes.1 <- N1.c0.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1
N1.Er.genes.2 <- N1.c0.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2
N1.Er.genes.3 <- N1.c0.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3
N1.Er.genes.4 <- N1.c0.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4
# cluster 0 - Cell type marker library - Brain astrocyte top hit and embryonic astrocytes
# gene list in term: brain astrocyte EFEMP1;NFIA;LIX1;PSAP;KIF21A;S100B;CRYAB;DKK3
# embryo astrocytes SOX2;BEX1;HMGCS1;PTPRZ1;LIX1;S100B;DKK3;ITM2C
# cluster 1 - stem cell pericyte (brain), stelate, astrocyte
# cluster 2 - hypothalmus, endothelial cells, macrophage
# endothelial CSTB;PRELID1;MT1X;CRIP2;RHOC;TMEM141;MT2A;RPS28;CCDC85B;EIF3I;RBP1;ID1;C4ORF3;ID3;PCBD1;MSX1;PPIC
# cluster 3 - smooth muscle cells
# cluster 4 - NK cells, fibroblasts
# NK cells ITGB1;RAB5C;GSTP1;PDCD5;EEF1B2;TGOLN2;SDCBP;MT2A;LDHA;SNRPD2;YWHAQ;ZNF326;TMSB10;CCDC50
# fibroblasts COL3A1;CALD1;COL6A3
# cluster 5 - endothelial cells, NK cells, CD8+
# cluster 6 - stromal cells eurythroblasts, none-neuronal, oligo
# reran and now there are only 5 clusters
# repeat checking
Cluster 0 - astrocytes Cluster 1 - pericyte astrocyte (weak still) Cluster 2 - endothelial Cluster 3 - smooth muscle Cluster 4 - NK/fibroblast Cluster 5 - endothelial Cluster 6 - non- neuronal
VlnPlot(seu.q, features = c("CD44","ITGB1","S100B"), group.by = 'orig.ident')
VlnPlot(seu.ft, features = c("CD44","ITGB1","S100B"), group.by = 'orig.ident')
Check expression of known markers
Idents(seu.q) <- 'RNA_snn_res.0.2'
feature_list = c("MKI67","SOX2","POU5F1","DLX2","PAX6","SOX9","HES1","NES","RBFOX3","MAP2","NCAM1","CD24","GRIA2","GRIN2B","GABBR1","GAD1","GAD2","GABRA1","GABRB2","TH","ALDH1A1","LMX1B","NR4A2","CORIN","CALB1","KCNJ6","CXCR4","ITGA6","SLC1A3","CD44","AQP4","S100B", "PDGFRA","OLIG2","MBP","CLDN11","VIM","VCAM1")
DoHeatmap(seu.q, features = feature_list, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = feature_list) +RotatedAxis()
PD_poulin = c("TH","SLC6A3","SLC18A2","SOX6","NDNF","SNCG","ALDH1A1","CALB1","TACR2","SLC17A6","SLC32A1","OTX2","GRP","LPL","CCK","VIP")
DoHeatmap(seu.q, features = PD_poulin, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = PD_poulin)+RotatedAxis()
ealryNeur = c("DCX","NEUROD1","TBR1")
proliferation = c("PCNA","MKI67")
neuralstem = c("SOX2","NES","PAX6","MASH1")
feature_list <- c("DCX","NEUROD1","TBR1","PCNA","MKI67","SOX2","NES","PAX6","MASH1")
DoHeatmap(seu.q, features = feature_list, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = feature_list)+RotatedAxis()
mat_neuron = c("RBFOX3","SYP","DLG45","VAMP1","VAMP2","TUBB3","SYT1","BSN","HOMER1","SLC17A6")
# NeuN is FOX3 - RBFOX3
# PSD95 also SP-90 or DLG4
# VGLUT2 is SLC17A6
DoHeatmap(seu.q, features = mat_neuron, size=3, angle =90, group.bar.height = 0.02)
# cluster 4 also show mature neuron markers
DotPlot(seu.q, features = mat_neuron)+RotatedAxis()
# excitatory neuron markers
ex = c("GRIA2","GRIA1","GRIA4","GRIN1","GRIN2B","GRIN2A","GRIN3A","GRIN3","GRIP1","CAMK2A")
DoHeatmap(seu.q, features = ex, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = ex)+RotatedAxis()
# inhibitory neuron markers
inh = c("GAD1","GAD2", "GAT1","PVALB","GABR2","GABR1","GBRR1","GABRB2","GABRB1","GABRB3","GABRA6","GABRA1","GABRA4","TRAK2")
DoHeatmap(seu.q, features = inh, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = inh)+RotatedAxis()
# cluster 4 is more excitatory than inhbitory but neither marker set has much expression
### glia markers
microglia = c("PTPRC","AIF1","ADGRE1") # ADGRE1 is a microglia marker F4/80, CD45 is PTPRC, gene name IBA1 is AIF1
astolgNPCpromicro = c("GFAP","S100B","SLC1A2","MBP","SOX10","SPP1","DCX","NEUROD1","TBR1","PCNA","MKI67","PTPRC","AIF1","ADGRE1")
# note GLT1 is EAAT2 which is SLC1A2 glutatmate transporter
# epithelial
epi = c("HES1","HES5","SOX2","SOX10","NES","CDH1","NOTCH1") # e-cadherin is CDH1
DoHeatmap(seu.q, features = astolgNPCpromicro, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = astolgNPCpromicro)+RotatedAxis()
# cluster 4 is more excitatory than inhbitory but neither marker set has much expression
DoHeatmap(seu.q, features = epi, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = epi)+RotatedAxis()
# also add Radial glia marker overlap with Glia and Neurons
features <- c("PTPRC","AIF1","ADGRE1", "VIM", "TNC","PTPRZ1","FAM107A","HOPX","LIFR",
"ITGB5","IL6ST")
DoHeatmap(seu.q, features = features, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = features)+RotatedAxis()
No TH expression
clusters 5 VIM highest, S100 B Cluster 4 Cluster 3 has some cells with high OTX2, NES indicates NPC/Precursors Cluster 2 has some SOX2 and PAX6 indicates NPC, VAMP2 indicating neurons, S100B highest and most - indicates astrocytes, also MBP indicates oligos Cluster 1 Cluster 0
Lable the clusters
Idents(seu.q) <- 'RNA_snn_res.0.2'
#seu.q <- BuildClusterTree(seu.q, reorder = TRUE, reorder.numeric = TRUE)
unique(seu.q$RNA_snn_res.0.2)
cluster.ids <- c("Astrocytes1","Astrocytes2","Precursors","RG1","RG2","Endothelial")
names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$subgroups <- Idents(seu.q)
#DimPlot(seu.q, group.by = 'RNA_snn_res.0.2', label = TRUE)
DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'subgroups', repel = TRUE)
# something weird is going on in the order
#saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia1LabledSeu301102022.RDS")
Compare the Astrocyte groups and get some markers for sub groups
astro.sub.markers <- FindMarkers(seu.q, ident.1 = "Astrocytes1", ident.2 = "Astrocytes2", only.pos = FALSE)
#top5 <- astro.sub.markers %>% top_n(n=5, wt = avg_log2FC)
DoHeatmap(seu.q, features = c("PLCG2","PTPRZ1","SNHG25","VCAN","LUM","DCN","S1004A"), size=3, angle =90, group.bar.height = 0.02)
astro2 <- rownames(astro.sub.markers %>% filter(avg_log2FC < 0.05))
DotPlot(seu.q, features = c("PLCG2","PTPRZ1","SNHG25","VCAN","LUM","DCN","S1004A")) + RotatedAxis()
DoHeatmap(seu.q, features = astro2[1:15], size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = astro2[1:15]) +RotatedAxis()
# radial glia subtyping
Idents(seu.q) <- ('subgroups')
rg.sub.markers <- FindMarkers(seu.q, ident.1 = "RG1", ident.2 = "RG2", only.pos = FALSE)
top5.up <- rg.sub.markers %>% top_n(n=10, wt = avg_log2FC)
top5.down <- rg.sub.markers %>% top_n(n=-10, wt = avg_log2FC)
ft <- rownames(top5.up)
DoHeatmap(seu.q, features = ft, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = ft) + RotatedAxis()
ft <- rownames(top5.down)
DoHeatmap(seu.q, features = ft, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = ft) + RotatedAxis()
npc.markers <- FindMarkers(seu.q, ident.1 = "Precursors", ident.2 = c("Astrocytes2","Astrocytes1"), only.pos = FALSE)
top10.npc <- npc.markers %>% top_n(n=10, wt = avg_log2FC)
npc.markers <- npc.markers %>% filter(avg_log2FC > 0)
dim(npc.markers)
ft <- rownames(top10.npc)
# consider naming precursors as astrocytes3
DoHeatmap(seu.q, features = ft, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = ft) + RotatedAxis()
npc.markers.rg <- FindMarkers(seu.q, ident.1 = "Precursors", ident.2 = c("RG2","RG1"), only.pos = TRUE)
top10.npc <- npc.markers.rg %>% top_n(n=10, wt = avg_log2FC)
ft <- rownames(top10.npc)
# consider naming precursors as astrocytes3
DoHeatmap(seu.q, features = ft, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = ft) + RotatedAxis()
## these have more differences from RG than Astrocytes
Add subtype gene ids
DimPlot(seu.q, group.by = 'RNA_snn_res.0.2')
cluster.ids <- c("Astrocytes-PLCG2","Astrocytes-DNC","Astrocytes-IGFBP2",
"RG1-CDKN1C","RG2-TYRP1","Endothelial")
#cluster.ids <- c("Astrocytes1","Astrocytes2","Precursors(Astrocytes)","RG1","RG2","Endothelial")
names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$Cell_types <- Idents(seu.q)
DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'Cell_types', repel = TRUE)
saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia1LabledSeu301102022.RDS")
# label with main cell type groups
DimPlot(seu.q, group.by = 'RNA_snn_res.0.2')
cluster.ids <- c("Astrocytes","Astrocytes","Astrocytes",
"RG","RG","Endothelial")
#cluster.ids <- c("Astrocytes1","Astrocytes2","Precursors(Astrocytes)","RG1","RG2","Endothelial")
names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$Cell_Type <- Idents(seu.q)
DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'Cell_Type', repel = TRUE)
saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia1LabledSeu301102022.RDS")
Label main cell types Res 0.8 predicitons 0 astro 1 Endo/RG/Astro 2 Astrocyte 3 RG/Endo/Astro 4 Astro 5 Astro 6 Epithela, endo, astro, RG 7 Astro 8 Astro 9 Astro/RG/Endo 10 Astro/RG 11 Astro/Neurons 12 Endo/RG.
#DimPlot(seu.q, group.by = 'RNA_snn_res.0.8')
Idents(seu.q) <- 'RNA_snn_res.0.8'
cluster.ids <- c("Astrocytes","Astrocytes-Endo-RG","Astrocytes",
"RG-Endo-Astro","Astro","Astro","Epi-Endo-Astro-RG",
"Astro","Astro","Astro-RG-Endo","Astro-RG","Astro-Neurons","Endo-RG")
cluster.ids <- c("Astrocytes","Astrocytes","Astrocytes",
"Astrocytes","Astrocytes","Astrocytes","Endothelial",
"Astrocytes","Astrocytes","Radial Glia","Radial Glia","Astrocytes","Endothelial")
names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$Cell_Type <- Idents(seu.q)
DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'Cell_Type', repel = TRUE)

saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia1LabledSeu301102022.RDS")
table(seu.q$Cell_Type)
Astrocytes RG Endothelial Radial Glia
16068 2955 506 471
Quick check the Glial2
# explore filtering
seu <- Glia2
seu
#
VlnPlot(seu, pt.size = 0.10, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
VlnPlot(seu, pt.size = 0.10, features = c("nFeature_RNA"), y.max = 1000)
VlnPlot(seu, pt.size = 0.10, features = c("nFeature_RNA"), y.max = 500)
VlnPlot(seu, pt.size = 0.10, features = c("nCount_RNA"), y.max = 2000)
# filter more cells
seu.ft <- subset(seu, subset = nFeature_RNA > 250 & nCount_RNA > 250 & nCount_RNA < 10000)
seu.ft
VlnPlot(seu.ft, pt.size = 0.10, features = c("nFeature_RNA"), y.max = 2000)
VlnPlot(seu.ft.glia, features = c("CD44","S100B","ITGB1"))
VlnPlot(seu.ft, features = c("CD44","S100B","ITGB1"), group.by = 'orig.ident')
# both glia populations are similar
Levels seem similar in Glia1 and Glia2
Remove doublets and start to process Glia 2
suppressMessages(require(DoubletFinder))
# filtering out MALAT1 and mitochondrial genes
seu.ft <- seu.ft[!grepl("MALAT1", rownames(seu.ft)), ]
seu.ft <- seu.ft[!grepl("^MT-", rownames(seu.ft)), ]
# like in the tutorial I'm following MALAT1 is the top most expressed gene. The top genes are a lot of MT and Ribosomal genes
seu.ft[["percent.rb"]] <- PercentageFeatureSet(seu.ft, pattern = "^RP")
seu.d = NormalizeData(seu.ft)
seu.d = FindVariableFeatures(seu.d, verbose = F)
seu.d = ScaleData(seu.d, vars.to.regress = c("nFeature_RNA", "percent.mt"),
verbose = F)
seu.d = RunPCA(seu.d, verbose = F, npcs = 15)
seu.d = RunUMAP(seu.d, dims = 1:10, verbose = F)
nExp <- round(ncol(seu.d) * 0.08) # expect more doublets because there is a lot more cells
seu.d <- doubletFinder_v3(seu.d, pN = 0.25, pK = 0.09, nExp = nExp, PCs = 1:10)
# the memory limit is reached here - I could run on compute canada
# For now I'll downsample
# this works
# name of the DF prediction can change, so extract the correct column name.
DF.name = colnames(seu.d@meta.data)[grepl("DF.classification", colnames(seu.d@meta.data))]
cowplot::plot_grid(ncol = 2, DimPlot(seu.d, group.by = "orig.ident") + NoAxes(),
DimPlot(seu.d, group.by = DF.name) + NoAxes())
VlnPlot(seu.d, features = "nFeature_RNA", group.by = DF.name, pt.size = 0.1)
seu.d <- seu.d[, seu.d@meta.data[, DF.name]== "Singlet"]
dim(seu.d)
dim(seu.ft)
Cluster
seu <- NormalizeData(seu.d, normalization.method = "LogNormalize", scale.factor = 10000)
seu <- FindVariableFeatures(seu, selection.method = "vst", nfeatures = 2000)
seu <- ScaleData(seu)
seu <- RunPCA(seu)
seu <- RunUMAP(seu, reduction = "pca", n.neighbors = 25, dims = 1:30)
DimPlot(seu, reduction = "umap")
seu.q <- FindNeighbors(seu, dims = 1:25, k.param = 25)
seu.q <- FindClusters(seu.q, resolution = c(0,0.05,0.2,0.4,0.5,0.6,0.8))
library(clustree)
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.05')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.1')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.2')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.4')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.6')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.8')
clustree(seu.q)
# 0.4 is likely the best annotate subgroups
Predict cell types
# SNCA and control midbrain organoids 165 days in culture
MBO <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AST23_BrainComm/MBOclusters_names29072021.rds")
# Midbrain AIW002 120 days in culture
AIWMBO <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AIWtrio120days/MOintegratedClusterK123res0.8.names_nov16_2021")
# Midbrain AIW002 60 days in culture
AIW60 <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AIWtrio60days/AWI002ParkinKOPinkKO60days_labels_14052022.rds")
#first predict with the MBO data
Idents(MBO) <- "cluster_labels"
DefaultAssay(MBO) <- "RNA"
# find the reference anchors
print("finding reference anchors")
anchors <- FindTransferAnchors(reference = MBO ,query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = MBO$cluster_labels)
seu.q <- AddMetaData(seu.q, metadata = predictions)
Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$MBOAST23.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'MBOAST23.pred', label = TRUE)
# see how accurate the predictions are
seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, "None")
Idents(seu.q) <- 'predicted.id'
seu.q$MBOAST23.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'predicted.id', label = TRUE)
table(seu.q$MBOAST23.pred)
table(seu.q$MBOAST23.thresh)
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.4, seu.q$MBOAST23.pred))
t.lables$Freq <- as.double(t.lables$Freq)
# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")
# clusters don't break up by the predicted cell types
############ another predictions now using the AIW organoids
Idents(AIWMBO) <- "res08names"
DefaultAssay(AIWMBO) <- "RNA"
anchors <- FindTransferAnchors(reference = AIWMBO ,query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = AIWMBO$res08names)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$AIW120.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'AIW120.pred', label = TRUE)
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.4, seu.q$MBOAIW.pred))
t.lables$Freq <- as.double(t.lables$Freq)
# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")
# see how accurate the predictions are
seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, "None")
Idents(seu.q) <- 'predicted.id'
seu.q$AIW120.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'AIW120.thresh', label = TRUE)
table(seu.q$AIW120.pred)
table(seu.q$AIW120.thresh)
# the predicted cell types make more sense from the AIW002 organoid
# now predict with the AIW002 60 days organoid
Idents(AIW60) <- "cluster.ids"
DefaultAssay(AIW60) <- "RNA"
anchors <- FindTransferAnchors(reference = AIW60, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = AIW60$cluster.ids)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$AIW60.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'AIW60.pred', label = TRUE)
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.4, seu.q$AIW60.pred))
t.lables$Freq <- as.double(t.lables$Freq)
# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")
# see how accurate the predictions are
seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, "None")
Idents(seu.q) <- 'predicted.id'
seu.q$AIW60.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'AIW60.thresh', label = TRUE)
table(seu.q$AIW60.pred)
table(seu.q$AIW60.thresh)
# most of the cells are predicted as NPCs in many populations
# save with predictions so far
#saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia2LabledSeu03102022.RDS")
seu.q <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia2LabledSeu03102022.RDS")
See how many cells are predicted as astrocytes with the threshold
DAsubtypes <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/Macosko_Data/DAsubgroups_processed.Rds")
Idents(astro.ref) <- "Cell_Subtype"
DefaultAssay(astro.ref) <- "RNA"
# find the reference anchors
print("finding reference anchors")
anchors <- FindTransferAnchors(reference = DAsubtypes ,query = seu.q, dims = 1:20)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = astro.ref$Cell_Subtype, k.weight = 10)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$astro.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'astro.pred', label = TRUE)
table(seu.q$astro.pred)
seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, "none")
print(table(seu.q$predicted.id))
Idents(seu.q) <- 'predicted.id'
seu.q$astro.pred.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'astro.pred.thresh', label = TRUE)
table(seu.q$astro.pred.thresh)
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$astro.pred.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()
top.pred.astro <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.astro <- top.pred.astro[order(top.pred.astro$Var1,-top.pred.astro$Freq),]
row.names(df.top.astro) <- NULL
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$astro.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()
top.pred.astro <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.astro <- top.pred.astro[order(top.pred.astro$Var1,-top.pred.astro$Freq),]
row.names(df.top.astro) <- NULL
# a lot of these cells are also getting labelled as astrocytes
Do these get labelled as DA neurons too???
seu.q <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia2LabledSeu03102022.RDS")
DAsubtypes <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/Macosko_Data/DAsubgroups_processed.Rds")
Idents(DAsubtypes) <- "Cell_Subtype"
da.ref <- subset(DAsubtypes, downsample = 500)
# find the reference anchors
print("finding reference anchors")
anchors <- FindTransferAnchors(reference = da.ref, query = seu.q, dims = 1:20)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = da.ref$Cell_Subtype, k.weight = 10)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$da.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'da.pred', label = TRUE)
table(seu.q$da.pred)
seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, "none")
Idents(seu.q) <- 'predicted.id'
seu.q$da.pred.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'da.pred.thresh', label = TRUE)
table(seu.q$da.pred.thresh)
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$da.pred.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()
top.pred.astro <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.astro <- top.pred.astro[order(top.pred.astro$Var1,-top.pred.da$Freq),]
row.names(df.top.astro) <- NULL
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$astro.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()
top.pred.astro <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.astro <- top.pred.astro[order(top.pred.astro$Var1,-top.pred.astro$Freq),]
row.names(df.top.astro) <- NULL
# after thresholding very few cells are predicted as neurons
{redicte with the brain scRNAseq
pathway <- "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/"
seu.q <- readRDS(paste(pathway,"Glia2LabledSeu03102022.RDS",sep = ""))
# midbrain and striatum
seu.r <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PublicData/Bhaduri_wholeBrain/Bhaduri_midbrain_striatum.RDS")
Idents(seu.r) <- "cell_cluster"
# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
Performing PCA on the provided reference using 1841 features as input.
Projecting cell embeddings
Finding neighborhoods
Finding anchors
Found 881 anchors
Filtering anchors
Retained 303 anchors
print("getting predictions")
[1] "getting predictions"
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_cluster)
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Predicting cell labels
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
Astrocyte_4 Endo_11 Endo_4 Endo_7 Neuron_2 Neuron_37 Neuron_98
5810 4 506 186 41 6 396
Idents(seu.q) <- 'predicted.id'
seu.q$Bha.mid.stri.pred <- Idents(seu.q)
print(table(seu.q$Bha.mid.stri.pred))
Astrocyte_4 Neuron_2 Neuron_98 Endo_4 Endo_7 Endo_11 Neuron_37
5810 41 396 506 186 4 6
seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, "none")
Idents(seu.q) <- 'predicted.id'
seu.q$Bha.mid.pred.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'Bha.mid.pred.thresh', label = TRUE)

table(seu.q$Bha.mid.pred.thresh)
none Astrocyte_4
6942 7
DimPlot(seu.q, group.by = 'Bha.mid.pred.thresh')

# read in the reference dataset
# whole brain Bhaduri down sampled
seu.r <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PublicData/Bhaduri_wholeBrain/Bhaduri_downsample.RDS")
Idents(seu.r) <- "cell_cluster"
# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
Performing PCA on the provided reference using 1844 features as input.
Projecting cell embeddings
Finding neighborhoods
Finding anchors
Found 1726 anchors
Filtering anchors
Retained 509 anchors
print("getting predictions")
[1] "getting predictions"
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_cluster)
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Predicting cell labels
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
Astrocyte_1 Astrocyte_2 Astrocyte_4 Astrocyte_6 Dividing_21
480 203 1310 2 136
Dividing_3 Endo_1 Endo_11 Endo_2 Endo_3
7 85 19 522 1
Endo_4 Endo_7 Endo_8 GW19_2_45Outlier GW19_Astrocyte_35
2 3 125 1604 1
GW20_Astrocyte_39 GW22both_RG_23 Interneuron_7 IPC_29 IPC_34
138 252 192 747 350
Microglia_6 Neuron_5 Neuron_93 Oligo_12 Oligo_9
5 6 253 11 1
RG_1 RG_11 RG_6
492 1 1
Idents(seu.q) <- 'predicted.id'
seu.q$Bha.pred <- Idents(seu.q)
print(table(seu.q$Bha.pred))
Astrocyte_2 GW19_2_45Outlier Endo_8 IPC_29 IPC_34
203 1604 125 747 350
Interneuron_7 Dividing_21 Astrocyte_4 GW20_Astrocyte_39 Astrocyte_1
192 136 1310 138 480
Neuron_93 GW22both_RG_23 Endo_2 Endo_11 Endo_1
253 252 522 19 85
RG_1 Neuron_5 Endo_3 Microglia_6 Dividing_3
492 6 1 5 7
Oligo_12 RG_6 Endo_7 Endo_4 Astrocyte_6
11 1 3 2 2
GW19_Astrocyte_35 Oligo_9 RG_11
1 1 1
DimPlot(seu.q, group.by = 'Bha.pred')
DimPlot(seu.q, group.by = 'subgroups')
Compare predictions - make a predictions table
# AIW002 120 days predictions - take the thresholded options
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$AIW120.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype.AIW120 <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.aiw120 <- top.pred.celltype.AIW120[order(top.pred.celltype.AIW120$Var1,-top.pred.celltype.AIW120$Freq),]
row.names(df.top.aiw120) <- NULL
df.top.aiw120$I <- row.names(df.top.aiw120)
# AIW002 60 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$AIW60.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype.AIW60 <-as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.aiw60 <- top.pred.celltype.AIW60[order(top.pred.celltype.AIW60$Var1,-top.pred.celltype.AIW60$Freq),]
row.names(df.top.aiw60) <- NULL
df.top.aiw60$I <- row.names(df.top.aiw60)
# AST23 165 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$MBOAST23.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype.AST23 <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.AST23 <- top.pred.celltype.AST23[order(top.pred.celltype.AST23$Var1,-top.pred.celltype.AST23$Freq),]
row.names(df.top.AST23) <- NULL
df.top.AST23$I <- row.names(df.top.AST23)
# add the threshold Astro predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$astro.pred.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype.astro <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.astro <- top.pred.celltype.astro[order(top.pred.celltype.astro$Var1,-top.pred.celltype.astro$Freq),]
row.names(df.top.astro) <- NULL
df.top.astro$I <- row.names(df.top.astro)
# add the neurons predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$da.pred.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype.da <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.da <- top.pred.celltype.da[order(top.pred.celltype.da$Var1,-top.pred.celltype.da$Freq),]
row.names(df.top.da) <- NULL
df.top.da$I <- row.names(df.top.da)
### add in the prediction from brain data Bhaduri midbrain and striatum
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$Bha.mid.stri.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype.Bha <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.Bha <- top.pred.celltype.Bha[order(top.pred.celltype.Bha$Var1,-top.pred.celltype.Bha$Freq),]
row.names(df.top.Bha) <- NULL
df.top.Bha$I <- row.names(df.top.Bha)
## these are calculated below
### add in the prediction from brain whole brain data Bhaduri midbrain down sampled
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$Bha.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype.Bha1 <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top.Bha1 <- top.pred.celltype.Bha1[order(top.pred.celltype.Bha$Var1,-top.pred.celltype.Bha$Freq),]
row.names(df.top.Bha1) <- NULL
df.top.Bha1$I <- row.names(df.top.Bha1)
pred.table <- merge(df.top.AST23, df.top.aiw60, by = 'I', all = TRUE)
pred.table <- merge(pred.table, df.top.aiw120, by = 'I')
pred.table <- merge(pred.table, df.top.Bha, by = 'I')
Warning in merge.data.frame(pred.table, df.top.Bha, by = "I") :
column names ‘Var1.x’, ‘Var2.x’, ‘Freq.x’, ‘Var1.y’, ‘Var2.y’, ‘Freq.y’ are duplicated in the result
pred.table <- merge(pred.table, df.top.Bha1, by = 'I')
Warning in merge.data.frame(pred.table, df.top.Bha1, by = "I") :
column names ‘Var1.x’, ‘Var2.x’, ‘Freq.x’, ‘Var1.y’, ‘Var2.y’, ‘Freq.y’ are duplicated in the result
pred.table
NA
NA
NA
Predicted cluster annotations 0 Unknown/ NPC 1 RG 2 astro 3 RG 4 neurons 5 RG
Predict from developing cortex
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
Performing PCA on the provided reference using 1389 features as input.
Projecting cell embeddings
Finding neighborhoods
Error in base::options(...future.oldOptions) : invalid argument
The dev cortex doesn’t predict Glia 2 well Try the developing forebrain
DefaultAssay(seu.r) <- 'RNA'
# clusters has subgroups
# levels are main cell groups
Idents(seu.r) <- "Level1"
# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
Performing PCA on the provided reference using 1851 features as input.
Projecting cell embeddings
Finding neighborhoods
Finding anchors
Found 585 anchors
Filtering anchors
Retained 250 anchors
print("getting predictions")
[1] "getting predictions"
predictions <- TransferData(anchorset = anchors, refdata = seu.r$Level1)
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Predicting cell labels
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
EarlyInhibitory Glioblast NeuralProgenitor RadialGlia VLMC
7 2893 1299 2630 120
Idents(seu.q) <- 'predicted.id'
seu.q$fb.pred <- Idents(seu.q)
print(table(seu.q$fb.pred))
RadialGlia Glioblast NeuralProgenitor EarlyInhibitory VLMC
2630 2893 1299 7 120
DimPlot(seu.q, group.by = 'fb.pred')

# add the threshold
seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.80, seu.q$predicted.id, "none")
Idents(seu.q) <- 'predicted.id'
seu.q$fb.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'fb.thresh', label = TRUE)

table(seu.q$fb.thresh)
RadialGlia none VLMC NeuralProgenitor
1028 5659 113 149
# make the tables
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$fb.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(2, Freq))
df.top <- top.pred.celltype[order(top.pred.celltype$Var1,-top.pred.celltype$Freq),]
row.names(df.top) <- NULL
df.top$I <- row.names(df.top)
#VLMC is vascular and leptomeninges
# almost everything is predicted as 'none' when theshold is .95
# run again with lower threshold 0.8 and many are predicted as RG
# try predicting with the cluster labels
# later I can subset cell types for the reference data
Idents(seu.r) <- "Clusters"
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
Performing PCA on the provided reference using 1851 features as input.
Projecting cell embeddings
Finding neighborhoods
Finding anchors
Found 585 anchors
Filtering anchors
Retained 250 anchors
print("getting predictions")
[1] "getting predictions"
predictions <- TransferData(anchorset = anchors, refdata = seu.r$Clusters)
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Predicting cell labels
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
Excitatory neurons possibly midbrain Glioblast/Pre-OPC
1947 4
OPCs Radial Glia VLMC primed?
1 2931
Radial glia/Glioblast/Forebrain progenitor EMX1 Striatum/Cortical neurons
75 1871
VLMCs
120
Idents(seu.q) <- 'predicted.id'
seu.q$fb.sub.pred <- Idents(seu.q)
print(table(seu.q$fb.sub.pred))
Radial Glia VLMC primed? Excitatory neurons possibly midbrain
2931 1947
Striatum/Cortical neurons Radial glia/Glioblast/Forebrain progenitor EMX1
1871 75
VLMCs OPCs
120 1
Glioblast/Pre-OPC
4
DimPlot(seu.q, group.by = 'fb.sub.pred')

# add the threshold
seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.80, seu.q$predicted.id, "none")
Idents(seu.q) <- 'predicted.id'
seu.q$fb.sub.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'fb.sub.thresh', label = TRUE)

table(seu.q$fb.sub.thresh)
Radial Glia VLMC primed? none
1015 5727
VLMCs Excitatory neurons possibly midbrain
113 94
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$fb.sub.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis()

top.pred.celltype <- as.data.frame(t.lables %>% group_by(Var1) %>% top_n(4, Freq))
df.top <- top.pred.celltype[order(top.pred.celltype$Var1,-top.pred.celltype$Freq),]
row.names(df.top) <- NULL
df.top$I <- row.names(df.top)
df.top.ft <- df.top %>% filter(Freq > 0)
Predictions with thresholds:
0 - RG 1 - RG 2 - none 3 - RG 4 - Neural precursor 5 - VLMC (vascular and leptomeninges)
Look at gene lists with known markers
Idents(seu.q) <- 'RNA_snn_res.0.2'
# many cell types list
feature_list = c("MKI67","SOX2","POU5F1","DLX2","PAX6","SOX9","HES1","NES","RBFOX3","MAP2","NCAM1","CD24","GRIA2","GRIN2B","GABBR1","GAD1","GAD2","GABRA1","GABRB2","TH","ALDH1A1","LMX1B","NR4A2","CORIN","CALB1","KCNJ6","CXCR4","ITGA6","SLC1A3","CD44","AQP4","S100B", "PDGFRA","OLIG2","MBP","CLDN11","VIM","VCAM1")
DoHeatmap(seu.q, features = feature_list, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = feature_list) +RotatedAxis()
# Dopaminergic markers
PD_poulin = c("TH","SLC6A3","SLC18A2","SOX6","NDNF","SNCG","ALDH1A1","CALB1","TACR2","SLC17A6","SLC32A1","OTX2","GRP","LPL","CCK","VIP")
DoHeatmap(seu.q, features = PD_poulin, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = PD_poulin)+RotatedAxis()
ealryNeur = c("DCX","NEUROD1","TBR1")
proliferation = c("PCNA","MKI67")
neuralstem = c("SOX2","NES","PAX6","MASH1")
feature_list <- c("DCX","NEUROD1","TBR1","PCNA","MKI67","SOX2","NES","PAX6","MASH1")
DoHeatmap(seu.q, features = feature_list, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = feature_list)+RotatedAxis()
mat_neuron = c("RBFOX3","SYP","DLG45","VAMP1","VAMP2","TUBB3","SYT1","BSN","HOMER1","SLC17A6")
# NeuN is FOX3 - RBFOX3
# PSD95 also SP-90 or DLG4
# VGLUT2 is SLC17A6
DoHeatmap(seu.q, features = mat_neuron, size=3, angle =90, group.bar.height = 0.02)
# cluster 4 also show mature neuron markers
DotPlot(seu.q, features = mat_neuron)+RotatedAxis()
# excitatory neuron markers
ex = c("GRIA2","GRIA1","GRIA4","GRIN1","GRIN2B","GRIN2A","GRIN3A","GRIN3","GRIP1","CAMK2A")
DoHeatmap(seu.q, features = ex, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = ex)+RotatedAxis()
# inhibitory neuron markers
inh = c("GAD1","GAD2", "GAT1","PVALB","GABR2","GABR1","GBRR1","GABRB2","GABRB1","GABRB3","GABRA6","GABRA1","GABRA4","TRAK2")
DoHeatmap(seu.q, features = inh, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = inh)+RotatedAxis()
# cluster 4 is more excitatory than inhbitory but neither marker set has much expression
### glia markers
microglia = c("PTPRC","AIF1","ADGRE1") # ADGRE1 is a microglia marker F4/80, CD45 is PTPRC, gene name IBA1 is AIF1
astolgNPCpromicro = c("GFAP","S100B","SLC1A2","MBP","SOX10","SPP1","DCX","NEUROD1","TBR1","PCNA","MKI67","PTPRC","AIF1","ADGRE1")
# note GLT1 is EAAT2 which is SLC1A2 glutatmate transporter
# epithelial
epi = c("HES1","HES5","SOX2","SOX10","NES","CDH1","NOTCH1") # e-cadherin is CDH1
DoHeatmap(seu.q, features = astolgNPCpromicro, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = astolgNPCpromicro)+RotatedAxis()
# cluster 4 is more excitatory than inhbitory but neither marker set has much expression
DoHeatmap(seu.q, features = epi, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = epi)+RotatedAxis()
# also add Radial glia marker overlap with Glia and Neurons
features <- c("PTPRC","AIF1","ADGRE1", "VIM", "TNC","PTPRZ1","FAM107A","HOPX","LIFR",
"ITGB5","IL6ST")
DoHeatmap(seu.q, features = features, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = features)+RotatedAxis()
# radial glia markers
rg <- c("VIM","NES","PAX6","HES1","EAAT1","NCAD1","SOX2","FABP7")
DoHeatmap(seu.q, features = rg, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = rg)+RotatedAxis()
# NPC and radial glia are very similar
Marker expression predictions Cluster 0 - unknown Cluster 1 - RG Cluster 2 - unknown Cluster 3 - RG cluster 4 - immature neurons Cluster 5 - RG, opc
Check the levels of RNA in each cluster
VlnPlot(seu.q, features = "nFeature_RNA")
Cluster 0 and 2 have fewer sequences than other groups and thus no markers Possibly remove these is they don’t come up with some markers
Find cluster markers
Idents(seu.q) <- 'RNA_snn_res.0.2'
ClusterMarkers <- FindAllMarkers(seu.q, only.pos = TRUE)
top5 <- ClusterMarkers %>% group_by(cluster) %>% top_n(n=5, wt = avg_log2FC)
DoHeatmap(seu.q, features = top5$gene, size=3, angle =90, group.bar.height = 0.02)
#write.csv(ClusterMarkers,"/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/Glia2RGClusterMarkers_new.csv")
Idents(seu.q) <- 'RNA_snn_res.0.1'
ClusterMarkers <- FindAllMarkers(seu.q, only.pos = TRUE)
top5 <- ClusterMarkers %>% group_by(cluster) %>% top_n(n=5, wt = avg_log2FC)
DoHeatmap(seu.q, features = top5$gene, size=3, angle =90, group.bar.height = 0.02)
Markers of 2 are matching with 5 possibly merge these together Cluster 0 markers don’t look up regulated but the list is long
Look at the libraries
library(enrichR)
setEnrichrSite("Enrichr") # Human genes
# list of all the databases
# libaries with cell types
db <- c('Descartes_Cell_Types_and_Tissue_2021',
'CellMarker_Augmented_2021','Azimuth_Cell_Types_2021')
# enrichr(genes, databases = NULL)
#I'll run the clusters one at a time
N1.c0 <- ClusterMarkers %>% filter(cluster == 0 & avg_log2FC > 0)
genes <- N1.c0$gene
N1.c0.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c0.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c0.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c0.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
N1.Er.genes.1 <- N1.c0.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1
N1.Er.genes.2 <- N1.c0.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2
N1.Er.genes.3 <- N1.c0.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3
The adult brain doesn’t predict radial glia - there are radial glia but I believe these are different from the ‘neuroblast’ type radial glia
I will use a developing brain reference
DimPlot(seu.q)
seu.q <- readRDS(paste(pathway,"Glia2LabledSeu03102022.RDS",sep = ""))
# midbrain and striatum
seu.r <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PublicData/Bhaduri_wholeBrain/Bhaduri_midbrain_striatum.RDS")
Idents(seu.r) <- "cell_cluster"
# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_cluster)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
Idents(seu.q) <- 'predicted.id'
seu.q$Bha.mid.stri.pred <- Idents(seu.q)
print(table(seu.q$Bha.mid.stri.pred))
seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, "none")
Idents(seu.q) <- 'predicted.id'
seu.q$Bha.mid.pred.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'Bha.mid.pred.thresh', label = TRUE)
table(seu.q$Bha.mid.pred.thresh)
DimPlot(seu.q, group.by = 'Bha.mid.pred.thresh')
# read in the reference dataset
# whole brain Bhaduri down sampled
seu.r <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PublicData/Bhaduri_wholeBrain/Bhaduri_downsample.RDS")
Idents(seu.r) <- "cell_cluster"
# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_cluster)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
Idents(seu.q) <- 'predicted.id'
seu.q$Bha.pred <- Idents(seu.q)
print(table(seu.q$Bha.pred))
DimPlot(seu.q, group.by = 'Bha.pred')
DimPlot(seu.q, group.by = 'subgroups')
seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, "none")
Idents(seu.q) <- 'predicted.id'
seu.q$Bha.mid.pred.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'Bha.pred.thresh', label = TRUE)
table(seu.q$Bha.mid.pred.thresh)
DimPlot(seu.q, group.by = 'Bha.pred.thresh')
Add some cell type annotations
Idents(seu.q) <- 'RNA_snn_res.0.2'
cluster.ids <- c("Glia1","RG1","Glia2","RG2","NeuronsImmature","RG3")
names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$subgroups <- Idents(seu.q)
#DimPlot(seu.q, group.by = 'RNA_snn_res.0.2', label = TRUE)
DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'subgroups', repel = TRUE)
# save file
saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia2LabledSeu03102022.RDS")
Main cell groups
Idents(seu.q) <- 'RNA_snn_res.0.2'
cluster.ids <- c("Radial Glia","Radial Glia","Radial Glia","Radial Glia","NPC","Other")
names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$Cell_Types <- Idents(seu.q)
#DimPlot(seu.q, group.by = 'RNA_snn_res.0.2', label = TRUE)
DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'Cell_Types', repel = TRUE)

saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia2LabledSeu03102022.RDS")
Proportions of cell types
table(seu.q$Cell_Types)
dim(seu.q)
prp <- as.data.frame(table(seu.q$Cell_Types))
prp
prp$prop <- prp$Freq/sum(prp$Freq)*100
prp$Sample <- 'RadialGlia'
prp
I’ll calculate the proportions for each cell type and make a table or plot in the comparison workbook.
---
title: "R Notebook"
output: html_notebook
---
Single cell seq after sorting for PhenoID

sample1 = Neurons1
sample2 = Neurons2
sample3 = Glia1 - Astrocytes (CD44+)
sample4 = Glia2 - Radial Glia (CD44-)

In HPC I have run steps of scrnabox (custom pipeline in progress)
1. Cell Ranger for feature seq
2. Create Seurat Objects 
3. Apply minimum filtering and calculate percent mitochondria.

I have technical 3 replicates with hashtag labels at this point I haven't yet demultiplex the hashtags. The data here will be treated as one sample.  I sorted three separate samples and pooled them together. 


```{r}
# set up the environment

library(Seurat)
library(dplyr)
library(Matrix)
library(ggplot2)

#rm(list = ls())


```


Read in the seurat objects made in compute canada

```{r}

# this seems to never load I'll use step 3 output that has some filtering 
# nFeature_RNA > 180 and percent.mt < 25

pathway <- "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/"

Neurons1 <- readRDS(paste(pathway,"seu1.rds",sep = ""))
Neurons2 <- readRDS(paste(pathway,"seu2.rds",sep = ""))
Glia1 <- readRDS(paste(pathway,"seu3.rds",sep = ""))
Glia2 <- readRDS(paste(pathway,"seu4.rds",sep = ""))

Neurons1
Neurons2
Glia1
Glia2


```


Have a look at the objects that already have some filtering


See the violin plots 

```{r}

VlnPlot(Neurons1, pt.size = 0.10, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

VlnPlot(Neurons1, pt.size = 0.10, features = c("nFeature_RNA"), y.max = 500)
VlnPlot(Neurons1, pt.size = 0.10, features = c("nCount_RNA"), y.max = 2000)


# filter more cells

Neuron1.ft <- subset(Neurons1, subset = nFeature_RNA > 250 & nCount_RNA > 250 & nCount_RNA < 10000) 
Neuron1.ft

# 33541 features across 1833 samples


```


Filters out specific genes

```{r}

# filter out MALAT1 super high expression

seu.ft <- seu[!grepl("MALAT1", rownames(seu)), ]
seu.ft <- seu.ft[!grepl("^MT-", rownames(seu.ft)), ]

# this filtered object might cluster differently


```

PCA and UMAP

```{r}

seu.q <- NormalizeData(seu.ft, normalization.method = "LogNormalize", scale.factor = 10000)
seu.q <- FindVariableFeatures(seu.q, selection.method = "vst", nfeatures = 2000)
seu.q <- ScaleData(seu.q)
seu.q <- RunPCA(seu.q)
seu.q <- RunUMAP(seu.q, reduction = "pca", n.neighbors = 25, dims = 1:30, min.dist = 0.25, spread = 2)

```


Try to find doublets with doublet finder

```{r}
remotes::install_github('chris-mcginnis-ucsf/DoubletFinder')
suppressMessages(require(DoubletFinder))


```

```{r}

seu.d = FindVariableFeatures(seu.q, verbose = F)
seu.d = ScaleData(seu.d, vars.to.regress = c("nFeature_RNA", "percent.mt"),
    verbose = F)
seu.d = RunPCA(seu.d, verbose = F, npcs = 20)
seu.d = RunUMAP(seu.d, dims = 1:10, verbose = F)

nExp <- round(ncol(seu.d) * 0.06)  # expect 6% doublets
seu.d <- doubletFinder_v3(seu.d, pN = 0.25, pK = 0.09, nExp = nExp, PCs = 1:10)


# name of the DF prediction can change, so extract the correct column name.
DF.name = colnames(seu.d@meta.data)[grepl("DF.classification", colnames(seu.d@meta.data))]



cowplot::plot_grid(ncol = 2, DimPlot(seu.d, group.by = "orig.ident") + NoAxes(),
    DimPlot(seu.d, group.by = DF.name) + NoAxes())


```

Do the double cells have more genes than the singlet??

```{r}

VlnPlot(seu.d, features = "nFeature_RNA", group.by = DF.name, pt.size = 0.1)
# yes 

```

Remove the doublets

```{r}

seu.d <- seu.d[, seu.d@meta.data[, DF.name]== "Singlet"]
dim(seu.d)
dim(seu)

# removed about 100 cells


```


Run clustering

```{r}

seu.q <- FindNeighbors(seu.d, dims = 1:25, k.param = 43)
seu.q <- FindClusters(seu.q, resolution = c(0,0.2,0.4,0.6))
seu.q <- FindClusters(seu.q, resolution = c(1.2))

library(clustree)
clustree(seu.q, prefix = "RNA_snn_res.")
DimPlot(seu.q)

```

Look at the predictions of cell types in seurat label transfer

3 organoid datasets


```{r}


# SNCA and control midbrain organoids 165 days in culture
MBO <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AST23_BrainComm/MBOclusters_names29072021.rds")

# Midbrain  AIW002 120 days in culture
AIWMBO <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AIWtrio120days/MOintegratedClusterK123res0.8.names_nov16_2021")

# Midbrain AIW002 60 days in culture

AIW60 <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AIWtrio60days/AWI002ParkinKOPinkKO60days_labels_14052022.rds")


# query
#seu.q <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/NeuronsFilteredSeu28092022.RDS")


#first predict with the MBO data
Idents(MBO) <- "cluster_labels"
DefaultAssay(MBO) <- "RNA"

# find the reference anchors
print("finding reference anchors")
anchors <- FindTransferAnchors(reference = MBO ,query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = MBO$cluster_labels)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$MBOAST23.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'MBOAST23.pred', label = TRUE)
 
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$predicted.id))
pr.t.lables <- as.data.frame(prop.table(table(seu.q$RNA_snn_res.0.2, seu.q$predicted.id)))
t.lables$Freq <- as.double(t.lables$Freq)


# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")

# clusters don't break up by the predicted cell types

############ another predictions now using the AIW organoids

Idents(AIWMBO) <- "res08names"
DefaultAssay(AIWMBO) <- "RNA"

anchors <- FindTransferAnchors(reference = AIWMBO ,query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = AIWMBO$res08names)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$MBOAIW.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'MBOAIW.pred', label = TRUE)
 
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$predicted.id))
pr.t.lables <- as.data.frame(prop.table(table(seu.q$RNA_snn_res.0.2, seu.q$predicted.id)))
t.lables$Freq <- as.double(t.lables$Freq)


# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")

# the predicted cell types make more sense from the AIW002 organoid
# now predict with the AIW002 60 days organoid

Idents(AIW60) <- "cluster.ids"
DefaultAssay(AIW60) <- "RNA"

anchors <- FindTransferAnchors(reference = AIW60, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = AIW60$cluster.ids) 
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$AIW60.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'AIW60.pred', label = TRUE)
 
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$predicted.id))
pr.t.lables <- as.data.frame(prop.table(table(seu.q$RNA_snn_res.0.2, seu.q$predicted.id)))
t.lables$Freq <- as.double(t.lables$Freq)

# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")

# save ojbect with predicitons
saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Neurons2PredictionsSeu30092022.RDS")



```


Look at the cell type predictions in each cluster - Neurons 1

```{r}
# AIW002 160 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.6, seu.q$MBOAIW.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AIW120 <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(1, Freq))

# AIW002 160 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.6, seu.q$AIW60.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AIW60 <-as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(1, Freq))

# AST23 65 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.6, seu.q$MBOAST23.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AST23 <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(1, Freq))


pred.table <- merge(top.pred.celltype.AIW120,top.pred.celltype.AIW60, by = 'Var1')
pred.table <- merge(pred.table, top.pred.celltype.AST23, by = 'Var1')
pred.table

```

Based on the 3 different predictions I can lable the cell types

0 - NPC or early neurons
1 - immature excitatory neurons
2 - NPC or early neurons
3 - RG or Oligos
4- Dopaminergic neurons - possibly early
5 - NPC or early neurons
6 - Radial GLia

Predictions with the DA subtypes from 

```{r}
# read in the 
seu.q <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Neuron1LabledSeu30092022.RDS")

DimPlot(seu.q)

```

Use the Bhaduri dataset

```{r}

# read in the reference dataset
# from Bhaduri midbrain and striatum
seu.r <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PublicData/Bhaduri_wholeBrain/Bhaduri_midbrain_striatum.RDS")
table(seu.r$cell_type)
table(seu.r$cell_class)
table(seu.r$cell_cluster)

Idents(seu.r) <- "cell_cluster"

# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_cluster)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
seu.q$Bha.mid.stri.pred <- Idents(seu.q)
print(table(seu.q$Bha.mid.stri.pred))
DimPlot(seu.q, group.by = 'Bha.mid.stri.pred')
DimPlot(seu.q, group.by = 'subgroups')

# do the predictions differ with the main cell type groups instead of the cluster in the reference data? 
Idents(seu.r) <- "cell_type"

# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_type)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
DimPlot(seu.q, group.by = 'predicted.id')

# good largest prediction is neurons. 



```





I will also find markers and look at a list of neuronal markers

```{r}

Idents(seu.q) <- 'RNA_snn_res.0.6'
ClusterMarkers <- FindAllMarkers(seu.q, only.pos = TRUE)

top5 <- ClusterMarkers %>% group_by(cluster) %>% top_n(n=5, wt = avg_log2FC)
DoHeatmap(seu.q, features = top5$gene, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')

write.csv(ClusterMarkers,"/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/Neurons1ClusterMarkers7.csv")


```

Explore some Gene expression levels

```{r}
feature_list = c("MKI67","SOX2","POU5F1","DLX2","PAX6","SOX9","HES1","NES","RBFOX3","MAP2","NCAM1","CD24","GRIA2","GRIN2B","GABBR1","GAD1","GAD2","GABRA1","GABRB2","TH","ALDH1A1","LMX1B","NR4A2","CORIN","CALB1","KCNJ6","CXCR4","ITGA6","SLC1A3","CD44","AQP4","S100B", "PDGFRA","OLIG2","MBP","CLDN11","VIM","VCAM1")

DoHeatmap(seu.q, features = feature_list, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = feature_list) +RotatedAxis()

PD_poulin = c("TH","SLC6A3","SLC18A2","SOX6","NDNF","SNCG","ALDH1A1","CALB1","TACR2","SLC17A6","SLC32A1","OTX2","GRP","LPL","CCK","VIP")

DoHeatmap(seu.q, features = PD_poulin, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = PD_poulin)+RotatedAxis()

ealryNeur = c("DCX","NEUROD1","TBR1")
proliferation = c("PCNA","MKI67")
neuralstem = c("SOX2","NES","PAX6","MASH1")

feature_list <- c("DCX","NEUROD1","TBR1","PCNA","MKI67","SOX2","NES","PAX6","MASH1")
DoHeatmap(seu.q, features = feature_list, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = feature_list)+RotatedAxis()
# no proliferation marker expression  PCNA or MKI67
# cluster 4 DA neurons - shows early neuron marker and low PAX 4
# cluster 3 has higher SOX2 - neuroblast marker / NPC marker

mat_neuron = c("RBFOX3","SYP","DLG45","VAMP1","VAMP2","TUBB3","SYT1","BSN","HOMER1","SLC17A6") 
# NeuN is FOX3 - RBFOX3
# PSD95 also SP-90 or DLG4
# VGLUT2 is SLC17A6
DoHeatmap(seu.q, features = mat_neuron, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
# cluster 4 also show mature neuron markers
DotPlot(seu.q, features = mat_neuron)+RotatedAxis()
# excitatory neuron markers
ex = c("GRIA2","GRIA1","GRIA4","GRIN1","GRIN2B","GRIN2A","GRIN3A","GRIN3","GRIP1","CAMK2A")
DoHeatmap(seu.q, features = ex, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = ex)+RotatedAxis()
# inhibitory neuron markers
inh = c("GAD1","GAD2", "GAT1","PVALB","GABR2","GABR1","GBRR1","GABRB2","GABRB1","GABRB3","GABRA6","GABRA1","GABRA4","TRAK2")
DoHeatmap(seu.q, features = inh, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = inh)+RotatedAxis()
# cluster 4 is more excitatory than inhbitory but neither marker set has much expression 



```


Checkout the Enricher cell type libraries from 

```{r}
# test markers for the 7 clusters in Neurons1 

library(devtools)
install_github("wjawaid/enrichR")
library(enrichR)


setEnrichrSite("Enrichr") # Human genes
# list of all the databases

dbs <- listEnrichrDbs()
dbs
# libaries with cell types

db <- c('Allen_Brain_Atlas_up','Descartes_Cell_Types_and_Tissue_2021',
        'CellMarker_Augmented_2021','Azimuth_Cell_Types_2021')

# enrichr(genes, databases = NULL)

N1.c0 <- ClusterMarkers %>% filter(cluster == 0 & avg_log2FC > 0)
genes <- N1.c0$gene

N1.c0.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c0.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c0.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c0.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")

N1.Er.genes.1 <- N1.c0.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1

N1.Er.genes.2 <- N1.c0.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2

N1.Er.genes.3 <- N1.c0.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3

# cluster 0 could be hypothalmus, DA neurons A13

N1.c1 <- ClusterMarkers %>% filter(cluster == 1 & avg_log2FC > 0)
genes <- N1.c1$gene

N1.c1.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c1.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c1.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c1.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c1.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")

N1.Er.genes.1 <- N1.c1.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1

N1.Er.genes.2 <- N1.c1.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2

N1.Er.genes.3 <- N1.c1.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3

N1.Er.genes.4 <- N1.c1.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4

# cluster 1; olfactory bulb, neural plate, maybe Radial Glia, 
N1.c2 <- ClusterMarkers %>% filter(cluster == 2 & avg_log2FC > 0)
genes <- N1.c2$gene

N1.c2.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c2.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c2.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c2.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c2.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")

N1.Er.genes.1 <- N1.c2.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1

N1.Er.genes.2 <- N1.c2.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2

N1.Er.genes.3 <- N1.c2.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3

N1.Er.genes.4 <- N1.c2.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4

# cluster 2 some brain nucleus, neural stem

N1.c3 <- ClusterMarkers %>% filter(cluster == 3 & avg_log2FC > 0)
genes <- N1.c3$gene

N1.c3.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c3.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c3.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c3.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c3.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")

N1.Er.genes.1 <- N1.c3.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1

N1.Er.genes.2 <- N1.c3.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2

N1.Er.genes.3 <- N1.c3.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3

N1.Er.genes.4 <- N1.c3.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4

# cluster 3 stromal cell of thymus, embryonic astrocytes, OPC, NK cells, monocytes

N1.c4 <- ClusterMarkers %>% filter(cluster == 4 & avg_log2FC > 0)
genes <- N1.c4$gene

N1.c4.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c4.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c4.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c4.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c4.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")

N1.Er.genes.1 <- N1.c4.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1

N1.Er.genes.2 <- N1.c4.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2

N1.Er.genes.3 <- N1.c4.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3

N1.Er.genes.4 <- N1.c4.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4

# Dentate gyrus - different cortical layers, neurons, neurons, NPC, neurons GABA,GLUT

N1.c5 <- ClusterMarkers %>% filter(cluster == 5 & avg_log2FC > 0)
genes <- N1.c5$gene

N1.c5.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c5.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c5.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c5.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c5.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")

N1.Er.genes.1 <- N1.c5.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1

N1.Er.genes.2 <- N1.c5.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2

N1.Er.genes.3 <- N1.c5.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3

N1.Er.genes.4 <- N1.c5.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4

# cluster 5 hippocampus, endothelial cells, pericytes

N1.c6 <- ClusterMarkers %>% filter(cluster == 6 & avg_log2FC > 0)
genes <- N1.c6$gene

N1.c6.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c6.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c6.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c6.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c6.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")

N1.Er.genes.1 <- N1.c6.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1

N1.Er.genes.2 <- N1.c6.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2

N1.Er.genes.3 <- N1.c6.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3

N1.Er.genes.4 <- N1.c6.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4

# cluster 6 brain cortex, shwann cell, endothelial, pericyte, GABA


```


After predicting with the brain data I think using a higher cluster number will work better


```{r}

# res1.2 has 10 clusters


# AIW002 120 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.1.2, seu.q$MBOAIW.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AIW120 <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.aiw120 <- top.pred.celltype.AIW120[order(top.pred.celltype.AIW120$Var1,-top.pred.celltype.AIW120$Freq),]
row.names(df.top.aiw120) <- NULL
df.top.aiw120$I <- row.names(df.top.aiw120)

# AIW002 60 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.1.2, seu.q$AIW60.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AIW60 <-as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.aiw60 <- top.pred.celltype.AIW60[order(top.pred.celltype.AIW60$Var1,-top.pred.celltype.AIW60$Freq),]
row.names(df.top.aiw60) <- NULL
# something went wrong and there are too many cluster 0 predictions
df.top.aiw60 <- df.top.aiw60 %>%  filter(!row_number() %in% c(2, 3, 4))
df.top.aiw60$I <- row.names(df.top.aiw60)

# AST23 165 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.1.2, seu.q$MBOAST23.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AST23 <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.AST23 <- top.pred.celltype.AST23[order(top.pred.celltype.AST23$Var1,-top.pred.celltype.AST23$Freq),]
row.names(df.top.AST23) <- NULL
df.top.AST23$I <- row.names(df.top.AST23)
df.top.AST23 <- df.top.aiw60 %>%  filter(!row_number() %in% c(22))

### add in the prediction from brain data Bhaduri midbrain and striatum
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.1.2, seu.q$Bha.mid.stri.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.Bha <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.Bha <- top.pred.celltype.Bha[order(top.pred.celltype.Bha$Var1,-top.pred.celltype.Bha$Freq),]
row.names(df.top.Bha) <- NULL
df.top.Bha$I <- row.names(df.top.Bha)


pred.table <- merge(df.top.AST23, df.top.aiw60, by = 'I', all = TRUE)
pred.table <- merge(pred.table, df.top.aiw120, by = 'I')
pred.table <- merge(pred.table, df.top.Bha, by = 'I')
pred.table

# cluster 3 is predicted as Oligo, RG, astro

```




Library of tissue cell types for up regulated genes per cluster
0 - hypothalmus, DA A13
1- neural plate, Radial Glia
2 - Neural stem
3 - stromal, astro OPC
4 - Neurons
5 - endothelial, pericyte
6 - maybe neurons maybe not






By the combined information - annotate the clusters in Neurons1

```{r}
#Based on the 3 different predictions I can lable the cell types

#0 - NPC or early neurons
#1 - immature excitatory neurons
#2 - NPC or early neurons
#3 - RG or Oligos
#4- Dopaminergic neurons - possibly early
#5 - NPC or early neurons
#6 - Radial Glia

Idents(seu.q) <- 'RNA_snn_res.0.6'
cluster.ids <- c("ImmatureNeurons","Neurons","NPC","OPC-RG","DAneurons",
                 "Other","RG")
unique(seu.q$RNA_snn_res.0.6)

names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$subgroups <- Idents(seu.q)

DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'subgroups', repel = TRUE)


saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Neuron1LabledSeu30092022.RDS")



```

```{r}
library(clustree)
clustree(seu.q)

```




After predicting with the brain data Bhaduri Midbrain and Striatum - choose main cell types to label
Tried the predictions with more clusters 0-9 res 1.2

```{r}


#0 - NPC or early neurons          = 0,7,2
#1 - immature excitatory neurons = 1,8
#2 - NPC or early neurons         = 4,6
#3 - RG or Oligos                 9,2,7
#4- Dopaminergic neurons - possibly early   = 3
#5 - NPC or early neurons   = 5
#6 - Radial Glia   = 9 


# 0 - neurons (NPC)
# 1 - neurons (NPC)
# 2 - astro (NPC)
# 3 - neurons (DA)
# 4 - neurons (NPC)
# 5 - Endothelial (predicted from Bha, predicted as NPC)
# 6 - Neurons/ Glia
# 7 - Astrocytes RG
# 8 - Neurons
# 9 - Endothelial RG, neurons

Idents(seu.q) <- 'RNA_snn_res.1.2'
cluster.ids <- c("Neurons","Neurons","Neurons","Neurons","Neurons",
                 "Endothelial","Neurons","Glia","Neurons","Radial Glia")
unique(seu.q$RNA_snn_res.0.6)

names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$Cell_Types <- Idents(seu.q)

DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'Cell_Types', repel = TRUE)

saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Neuron1LabledSeu30092022.RDS")


```








### Next Repeat everything for Neurons2

```{r}
# explore filtering
seu <- Neurons2
seu
VlnPlot(seu, pt.size = 0.10, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

VlnPlot(seu, pt.size = 0.10, features = c("nFeature_RNA"), y.max = 2000)
VlnPlot(seu, pt.size = 0.10, features = c("nFeature_RNA"), y.max = 350)
VlnPlot(seu, pt.size = 0.10, features = c("nCount_RNA"), y.max = 2000)

# filter more cells

seu.ft <- subset(seu, subset = nFeature_RNA > 300 & nCount_RNA > 500 & nCount_RNA < 10000) 
seu.ft

# 17604 samples with 250 nFeature_RNA
# 9657 with  nFeature 300 and nCOunt 500


```


Doublet finder 

```{r}
suppressMessages(require(DoubletFinder))

# filtering out MALAT1 and mitochondrial genes

seu.ft <- seu.ft[!grepl("MALAT1", rownames(seu)), ]
seu.ft <- seu.ft[!grepl("^MT-", rownames(seu.ft)), ]

# like in the tutorial I'm following MALAT1 is the top most expressed gene.  The top genes are a lot of MT and Ribosomal genes

seu.ft[["percent.rb"]] <- PercentageFeatureSet(seu.ft, pattern = "^RP")

seu.d = NormalizeData(seu.ft)
seu.d = FindVariableFeatures(seu.d, verbose = F)
seu.d = ScaleData(seu.d, vars.to.regress = c("nFeature_RNA", "percent.mt"),
    verbose = F)
seu.d = RunPCA(seu.d, verbose = F, npcs = 30)
seu.d = RunUMAP(seu.d, dims = 1:10, verbose = F)

nExp <- round(ncol(seu.d) * 0.08)  # expect more doublets because there is a lot more cells
seu.d <- doubletFinder_v3(seu.d, pN = 0.25, pK = 0.09, nExp = nExp, PCs = 1:10)


# name of the DF prediction can change, so extract the correct column name.
DF.name = colnames(seu.d@meta.data)[grepl("DF.classification", colnames(seu.d@meta.data))]


cowplot::plot_grid(ncol = 2, DimPlot(seu.d, group.by = "orig.ident") + NoAxes(),
    DimPlot(seu.d, group.by = DF.name) + NoAxes())

VlnPlot(seu.d, features = "nFeature_RNA", group.by = DF.name, pt.size = 0.1)

```


Remove the doublet cells

```{r}

seu.d <- seu.d[, seu.d@meta.data[, DF.name]== "Singlet"]
dim(seu.d)
dim(seu)
# 9657 cells pre filter
# 8884 cells after filtering
# note the percent doubles expected is close to the percent detected

```


Repeat workflow with doublet removed data and find clusters for 

```{r}


seu <- NormalizeData(seu.d, normalization.method = "LogNormalize", scale.factor = 10000)
seu <- FindVariableFeatures(seu, selection.method = "vst", nfeatures = 2000)
seu <- ScaleData(seu)
seu <- RunPCA(seu)
seu <- RunUMAP(seu, reduction = "pca", n.neighbors = 43, dims = 1:30)
DimPlot(seu, reduction = "umap")

seu.q <- FindNeighbors(seu, dims = 1:25, k.param = 43)
seu.q <- FindClusters(seu.q, resolution = c(0,0.2,0.4,0.6))

library(clustree)
clustree(seu.q)

DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.2')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.4')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.6')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.1.2')

```

Label cell types using the label transfer

```{r}



# SNCA and control midbrain organoids 165 days in culture
MBO <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AST23_BrainComm/MBOclusters_names29072021.rds")

# Midbrain  AIW002 120 days in culture
AIWMBO <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AIWtrio120days/MOintegratedClusterK123res0.8.names_nov16_2021")

# Midbrain AIW002 60 days in culture

AIW60 <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AIWtrio60days/AWI002ParkinKOPinkKO60days_labels_14052022.rds")


# query
#seu.q <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/NeuronsFilteredSeu28092022.RDS")


#first predict with the MBO data
Idents(MBO) <- "cluster_labels"
DefaultAssay(MBO) <- "RNA"

# find the reference anchors
print("finding reference anchors")
anchors <- FindTransferAnchors(reference = MBO ,query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = MBO$cluster_labels)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$MBOAST23.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'MBOAST23.pred', label = TRUE)
 
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$predicted.id))
pr.t.lables <- as.data.frame(prop.table(table(seu.q$RNA_snn_res.0.2, seu.q$predicted.id)))
t.lables$Freq <- as.double(t.lables$Freq)


# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")

# clusters don't break up by the predicted cell types

############ another predictions now using the AIW organoids

Idents(AIWMBO) <- "res08names"
DefaultAssay(AIWMBO) <- "RNA"

anchors <- FindTransferAnchors(reference = AIWMBO ,query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = AIWMBO$res08names)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$MBOAIW.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'MBOAIW.pred', label = TRUE)
 
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$predicted.id))
pr.t.lables <- as.data.frame(prop.table(table(seu.q$RNA_snn_res.0.2, seu.q$predicted.id)))
t.lables$Freq <- as.double(t.lables$Freq)


# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")

# the predicted cell types make more sense from the AIW002 organoid
# now predict with the AIW002 60 days organoid

Idents(AIW60) <- "cluster.ids"
DefaultAssay(AIW60) <- "RNA"

anchors <- FindTransferAnchors(reference = AIW60, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = AIW60$cluster.ids) 
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$AIW60.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'AIW60.pred', label = TRUE)
 
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$predicted.id))
pr.t.lables <- as.data.frame(prop.table(table(seu.q$RNA_snn_res.0.2, seu.q$predicted.id)))
t.lables$Freq <- as.double(t.lables$Freq)


# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")

# save ojbect with predicitons
saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Neurons2PredictionsSeu30092022.RDS")



```


Predict from Brain Bhahani

```{r}

seu.q <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Neurons2LabelsSeu30092022.RDS")


# read in the reference dataset
# from Bhaduri midbrain and striatum
seu.r <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PublicData/Bhaduri_wholeBrain/Bhaduri_midbrain_striatum.RDS")
table(seu.r$cell_type)
table(seu.r$cell_class)
table(seu.r$cell_cluster)

Idents(seu.r) <- "cell_cluster"

# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_cluster)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
seu.q$Bha.mid.stri.pred <- Idents(seu.q)
print(table(seu.q$Bha.mid.stri.pred))
DimPlot(seu.q, group.by = 'Bha.mid.stri.pred')
DimPlot(seu.q, group.by = 'subgroups')

# do the predictions differ with the main cell type groups instead of the cluster in the reference data? 
Idents(seu.r) <- "cell_type"

# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_type)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
DimPlot(seu.q, group.by = 'predicted.id')

# good largest prediction is neurons. 


```



See the top predictions for each cluster in Neurons2 res 06 


```{r}

# AIW002 120 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.6, seu.q$MBOAIW.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AIW120 <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.aiw120 <- top.pred.celltype.AIW120[order(top.pred.celltype.AIW120$Var1,-top.pred.celltype.AIW120$Freq),]
row.names(df.top.aiw120) <- NULL
df.top.aiw120 <- df.top.aiw120 %>%  filter(!row_number() %in% c(20, 21, 22, 23, 24))
df.top.aiw120$I <- row.names(df.top.aiw120)

# AIW002 60 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.6, seu.q$AIW60.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AIW60 <-as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.aiw60 <- top.pred.celltype.AIW60[order(top.pred.celltype.AIW60$Var1,-top.pred.celltype.AIW60$Freq),]
row.names(df.top.aiw60) <- NULL
# something went wrong and there are too many cluster 0 predictions
#df.top.aiw60 <- df.top.aiw60 %>%  filter(!row_number() %in% c(2, 3, 4))
df.top.aiw60$I <- row.names(df.top.aiw60)

# AST23 165 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.6, seu.q$MBOAST23.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AST23 <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.AST23 <- top.pred.celltype.AST23[order(top.pred.celltype.AST23$Var1,-top.pred.celltype.AST23$Freq),]
row.names(df.top.AST23) <- NULL
df.top.AST23 <- df.top.AST23 %>%  filter(!row_number() %in% c(20, 21, 22, 23, 24, 25))
df.top.AST23$I <- row.names(df.top.AST23)

### add in the prediction from brain data Bhaduri midbrain and striatum
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.6, seu.q$Bha.mid.stri.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.Bha <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.Bha <- top.pred.celltype.Bha[order(top.pred.celltype.Bha$Var1,-top.pred.celltype.Bha$Freq),]
row.names(df.top.Bha) <- NULL
df.top.Bha$I <- row.names(df.top.Bha)


pred.table <- merge(df.top.AST23, df.top.aiw60, by = 'I', all = TRUE)
pred.table <- merge(pred.table, df.top.aiw120, by = 'I')
pred.table <- merge(pred.table, df.top.Bha, by = 'I')
pred.table

# cluster 3 is predicted as Oligo, RG, astro




```


```{r}
# res1.2 has 15 clusters

# AIW002 120 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.1.2, seu.q$MBOAIW.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AIW120 <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.aiw120 <- top.pred.celltype.AIW120[order(top.pred.celltype.AIW120$Var1,-top.pred.celltype.AIW120$Freq),]
row.names(df.top.aiw120) <- NULL
df.top.aiw120 <- df.top.aiw120 %>%  filter(!row_number() %in% c(26, 27, 28, 29, 30))
df.top.aiw120$I <- row.names(df.top.aiw120)

# AIW002 60 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.1.2, seu.q$AIW60.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AIW60 <-as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.aiw60 <- top.pred.celltype.AIW60[order(top.pred.celltype.AIW60$Var1,-top.pred.celltype.AIW60$Freq),]
row.names(df.top.aiw60) <- NULL
# something went wrong and there are too many cluster 0 predictions
#df.top.aiw60 <- df.top.aiw60 %>%  filter(!row_number() %in% c(2, 3, 4))
df.top.aiw60$I <- row.names(df.top.aiw60)

# AST23 165 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.1.2, seu.q$MBOAST23.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AST23 <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.AST23 <- top.pred.celltype.AST23[order(top.pred.celltype.AST23$Var1,-top.pred.celltype.AST23$Freq),]
row.names(df.top.AST23) <- NULL
df.top.AST23 <- df.top.AST23 %>%  filter(!row_number() %in% c(26, 27, 28, 29, 30, 31))
df.top.AST23$I <- row.names(df.top.AST23)


### add in the prediction from brain data Bhaduri midbrain and striatum
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.1.2, seu.q$Bha.mid.stri.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.Bha <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.Bha <- top.pred.celltype.Bha[order(top.pred.celltype.Bha$Var1,-top.pred.celltype.Bha$Freq),]
row.names(df.top.Bha) <- NULL
df.top.Bha$I <- row.names(df.top.Bha)


pred.table <- merge(df.top.AST23, df.top.aiw60, by = 'I', all = TRUE)
pred.table <- merge(pred.table, df.top.aiw120, by = 'I')
pred.table <- merge(pred.table, df.top.Bha, by = 'I')
pred.table

# neurons 98 matches up with DA neurons

```




What cell types are predicted across the 3 references

0 - Neurons early , NPC, neurons excitatory
1 - Neurons early, NPC
2 - Neurons early, NPC, neurons excitatory some DA neurons
3 - Oligo, RG, 
4 - Excitatory neurons, NPC, early neurons
5 - DA neurons, early DA neurons
6 - neurons immature NPC
7 - DA neurons
8 - RG, oligo, OPC, NPC
9 - Radial Glia
10 - NPC, neurons, oligo
11 - NPC, neurons, oligo


Find cluster markers and see how those would annotate

```{r}
Idents(seu.q) <- 'RNA_snn_res.0.6'
ClusterMarkers <- FindAllMarkers(seu.q, only.pos = TRUE)

top5 <- ClusterMarkers %>% group_by(cluster) %>% top_n(n=5, wt = avg_log2FC)
DoHeatmap(seu.q, features = top5$gene, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')

write.csv(ClusterMarkers,"/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/Neurons1ClusterMarkers11.csv")

```
Cluster 0 has fewer markers. 
2 and 5 have similar up reg markers
3 and 4 also overlap



Look at the cluster markers in cell type libraries for Neurons 2

```{r}
library(enrichR)
db <- c('Allen_Brain_Atlas_up','Descartes_Cell_Types_and_Tissue_2021',
        'CellMarker_Augmented_2021','Azimuth_Cell_Types_2021')

# enrichr(genes, databases = NULL)
# cluster 0

N1.c0 <- ClusterMarkers %>% filter(cluster == 0 & avg_log2FC > 0)
genes <- N1.c0$gene

N1.c0.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c0.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c0.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c0.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")

N1.Er.genes.1 <- N1.c0.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1

N1.Er.genes.2 <- N1.c0.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2

N1.Er.genes.3 <- N1.c0.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3

# cluster 1

N1.c1 <- ClusterMarkers %>% filter(cluster == 1 & avg_log2FC > 0)
genes <- N1.c1$gene

N1.c1.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c1.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c1.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c1.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c1.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")

N1.Er.genes.1 <- N1.c1.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1

N1.Er.genes.2 <- N1.c1.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2

N1.Er.genes.3 <- N1.c1.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3

N1.Er.genes.4 <- N1.c1.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4

# cluster 1; olfactory bulb, neural plate, maybe Radial Glia, 
N1.c2 <- ClusterMarkers %>% filter(cluster == 2 & avg_log2FC > 0)
genes <- N1.c2$gene

N1.c2.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c2.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c2.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c2.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c2.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")

N1.Er.genes.1 <- N1.c2.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1

N1.Er.genes.2 <- N1.c2.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2

N1.Er.genes.3 <- N1.c2.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3

N1.Er.genes.4 <- N1.c2.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4

# cluster 3

N1.c3 <- ClusterMarkers %>% filter(cluster == 3 & avg_log2FC > 0)
genes <- N1.c3$gene

N1.c3.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c3.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c3.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c3.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c3.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")

N1.Er.genes.1 <- N1.c3.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1

N1.Er.genes.2 <- N1.c3.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2

N1.Er.genes.3 <- N1.c3.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3

N1.Er.genes.4 <- N1.c3.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4

# cluster 4

N1.c4 <- ClusterMarkers %>% filter(cluster == 4 & avg_log2FC > 0)
genes <- N1.c4$gene

N1.c4.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c4.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c4.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c4.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c4.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")

N1.Er.genes.1 <- N1.c4.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1

N1.Er.genes.2 <- N1.c4.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2

N1.Er.genes.3 <- N1.c4.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3

N1.Er.genes.4 <- N1.c4.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4

# cluster 5

N1.c5 <- ClusterMarkers %>% filter(cluster == 5 & avg_log2FC > 0)
genes <- N1.c5$gene

N1.c5.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c5.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c5.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c5.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c5.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")

N1.Er.genes.1 <- N1.c5.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1

N1.Er.genes.2 <- N1.c5.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2

N1.Er.genes.3 <- N1.c5.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3

N1.Er.genes.4 <- N1.c5.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4

# cluster 6

N1.c6 <- ClusterMarkers %>% filter(cluster == 6 & avg_log2FC > 0)
genes <- N1.c6$gene

N1.c6.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c6.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c6.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c6.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c6.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")

N1.Er.genes.1 <- N1.c6.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1

N1.Er.genes.2 <- N1.c6.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2

N1.Er.genes.3 <- N1.c6.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3

N1.Er.genes.4 <- N1.c6.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4


# other clusters - change the cluster number
N1.c6 <- ClusterMarkers %>% filter(cluster == 11 & avg_log2FC > 0)
genes <- N1.c6$gene

N1.c6.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c6.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c6.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c6.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c6.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")

N1.Er.genes.1 <- N1.c6.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1

N1.Er.genes.2 <- N1.c6.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2

N1.Er.genes.3 <- N1.c6.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3

N1.Er.genes.4 <- N1.c6.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4


```

cluster 0 - astrocyte, radial glia, microglia, striatum - very few genes in these terms
cluster 1 - adrenal, thalmus, endothelial, astrocytes, neurons
clusters 2 - neural plate, stratum, neurons, NPC, stem, astro,neuroendocrine
cluster 3 - brain molecular layer, endothelial, astrocyte embryonic, 
cluster 4 - DG, striatum CA3, GABAergic neurons
cluster 5 - DG, neurons, Glutamatergic neurons
cluster 6 - neurons, astrocyte, microglia, GABAneurons
cluster 7 - neurons, glut and gaba
cluster 8 - astrocyte, endothelial, pericyte
cluster 9 - epithelial, embryonic astrocytes, GABA neurons
cluster 10 - epithelial
cluster 11 - endothelial, immune cells T cells


Expression of markers genes in Neurons2 

```{r}

feature_list = c("MKI67","SOX2","POU5F1","DLX2","PAX6","SOX9","HES1","NES","RBFOX3","MAP2","NCAM1","CD24","GRIA2","GRIN2B","GABBR1","GAD1","GAD2","GABRA1","GABRB2","TH","ALDH1A1","LMX1B","NR4A2","CORIN","CALB1","KCNJ6","CXCR4","ITGA6","SLC1A3","CD44","AQP4","S100B", "PDGFRA","OLIG2","MBP","CLDN11","VIM","VCAM1")

DoHeatmap(seu.q, features = feature_list, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = feature_list) +RotatedAxis()

PD_poulin = c("TH","SLC6A3","SLC18A2","SOX6","NDNF","SNCG","ALDH1A1","CALB1","TACR2","SLC17A6","SLC32A1","OTX2","GRP","LPL","CCK","VIP")

DoHeatmap(seu.q, features = PD_poulin, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = PD_poulin)+RotatedAxis()

ealryNeur = c("DCX","NEUROD1","TBR1")
proliferation = c("PCNA","MKI67")
neuralstem = c("SOX2","NES","PAX6","MASH1")

feature_list <- c("DCX","NEUROD1","TBR1","PCNA","MKI67","SOX2","NES","PAX6","MASH1")
DoHeatmap(seu.q, features = feature_list, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = feature_list)+RotatedAxis()
# no proliferation marker expression  PCNA or MKI67
# cluster 4 DA neurons - shows early neuron marker and low PAX 4
# cluster 3 has higher SOX2 - neuroblast marker / NPC marker

mat_neuron = c("RBFOX3","SYP","DLG45","VAMP1","VAMP2","TUBB3","SYT1","BSN","HOMER1","SLC17A6") 
# NeuN is FOX3 - RBFOX3
# PSD95 also SP-90 or DLG4
# VGLUT2 is SLC17A6
DoHeatmap(seu.q, features = mat_neuron, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
# cluster 4 also show mature neuron markers
DotPlot(seu.q, features = mat_neuron)+RotatedAxis()
# excitatory neuron markers
ex = c("GRIA2","GRIA1","GRIA4","GRIN1","GRIN2B","GRIN2A","GRIN3A","GRIN3","GRIP1","CAMK2A")
DoHeatmap(seu.q, features = ex, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = ex)+RotatedAxis()
# inhibitory neuron markers
inh = c("GAD1","GAD2", "GAT1","PVALB","GABR2","GABR1","GBRR1","GABRB2","GABRB1","GABRB3","GABRA6","GABRA1","GABRA4","TRAK2")
DoHeatmap(seu.q, features = inh, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = inh)+RotatedAxis()
# cluster 4 is more excitatory than inhbitory but neither marker set has much expression 

### glia markers
microglia = c("PTPRC","AIF1","ADGRE1")  # ADGRE1 is a microglia marker F4/80, CD45 is PTPRC, gene name IBA1 is AIF1
astolgNPCpromicro = c("GFAP","S100B","SLC1A2","MBP","SOX10","SPP1","DCX","NEUROD1","TBR1","PCNA","MKI67","PTPRC","AIF1","ADGRE1")
# note GLT1 is EAAT2 which is SLC1A2 glutatmate transporter
# epithelial
epi = c("HES1","HES5","SOX2","SOX10","NES","CDH1","NOTCH1") # e-cadherin is CDH1

DoHeatmap(seu.q, features = astolgNPCpromicro, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = astolgNPCpromicro, group.by = 'RNA_snn_res.0.6')+RotatedAxis()
# cluster 4 is more excitatory than inhbitory but neither marker set has much expression 
DoHeatmap(seu.q, features = epi, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = epi, group.by = 'RNA_snn_res.0.6')+RotatedAxis()

# also add Radial glia marker overlap with Glia and Neurons

features <- c("PTPRC","AIF1","ADGRE1", "VIM", "TNC","PTPRZ1","FAM107A","HOPX","LIFR",
              "ITGB5","IL6ST")
DoHeatmap(seu.q, features = features, size=3, angle =90, group.bar.height = 0.02, group.by = 'RNA_snn_res.0.6')
DotPlot(seu.q, features = features, group.by = 'RNA_snn_res.0.6')+RotatedAxis()


```


Label the Neuron2 FACS population 

```{r}

Idents(seu.q) <- 'RNA_snn_res.0.6'
cluster.ids <- c("Neurons1","immatureNeurons1","Neurons2",
                 "Other","DAneurons1","DAneurons2","immatureNeurons2",
                 "DAneurons3","RG","immatureNeurons1","Epithelial","Endothelial")
unique(seu.q$RNA_snn_res.0.6)

names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$subgroups <- Idents(seu.q)

DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'subgroups', repel = TRUE)


# label again with just numbering
# then I'll find markers in pairs to distinguish groups. 

Idents(seu.q) <- 'RNA_snn_res.0.6'
cluster.ids <- c("Neurons1","Neurons2","Neurons3",
                 "Other","DAneurons1","DAneurons2","Neurons4",
                 "DAneurons3","RG","Neurons2","Epithelial","Endothelial")
unique(seu.q$RNA_snn_res.0.6)

names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$number.groups <- Idents(seu.q)

DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'number.groups', repel = TRUE)


saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Neurons2LabelsSeu30092022.RDS")



```



Find markers in pairs to go back and classify the subgroups.
Will need to return to this for Neurons1 FACS

Neurons 5 and Neurons2 had similar markers and were merged
Subset again

```{r}

# faster to make a subset objects of only neurons and use find all markers

neuron.sub <- subset(seu.q, idents = c("Neurons1","Neurons2","Neurons3",
                                       "Neurons4"))

neuron.sub.markers <- FindAllMarkers(neuron.sub)

top5 <- neuron.sub.markers %>% group_by(cluster) %>% top_n(n=5, wt = avg_log2FC)
DoHeatmap(neuron.sub, features = top5$gene, size=3, angle =90, group.bar.height = 0.02)

DotPlot(neuron.sub, features = top5$gene) + RotatedAxis()


# neurons 5 was joined to Neurons 2
# cluster markers done again

# Neurons1 : MGP (targeting neural projections BMP signaling), HPD (excitatory and inibitory, could be cell adhesion or apoptosis), MSX1 (transcription factor BMP signaling, midbrain marker, developmental), CYP1B1 (redox homeostatis)   
# MGP, HPD, MSX1, CYP1B1

# there isn't a good proportion of cells expression any of the markers, Neurons2 has the best amount

# Neurons2: TFP12 serine protease melatonin conversion, PTN (cytokine signaling), LYgH (inhances nAChRs), S100A10 (modulates serotonin receptor), IFI27 (antiviral activity)
#TFP12,PTN, LY6H, S100A10, IFI27 

# Neurons3: SOX4, ASCL1 (neurogenesis),

# SOX4 is a marker of 3 but has high expression in 4 as well 

# Neurons4: PCAT4 (not noted in neurons), TPH1 (5HT synthesis), GK5 (neuronal maintainance), SST (somatostatin, GABA spike regulation), TTR (neural protective in AD)
# PCAT4, TPH1, GK5, SST, TTR

# markers to use

# Neurons1: MSX1, CYP1B1      
# Neurons2: LY6H, S100A10     
# Neurons3: SOX4, ASCL1 (Neurogenesis) Immature
# Neurons4: GK5, SST                         More mature



# Maybe neurons 1 and 2 could be merged

# lets see how the markers would look

neurons.1and2 <- FindMarkers(neuron.sub, ident.1 = c("Neurons1","Neurons2"),
                             ident.2 = c("Neurons3","Neurons4"))

top10 <- neurons.1and2 %>% top_n(n=10, wt = avg_log2FC)
ft.up <- rownames(top10) # up in Neurons1 and 3
top10 <- neurons.1and2 %>% top_n(n=-10, wt = avg_log2FC)
ft.down <- rownames(top10)
features <- c(ft.up,ft.down)

DoHeatmap(neuron.sub, features = features, size=3, angle =90, group.bar.height = 0.02)

DotPlot(neuron.sub, features = features) + RotatedAxis()

# all the markers were up regulated in neurons2 and not really neurons1
# I'll keep them separated


```


Use subgrouping and find cluster markers to look at neuronal subtypes.

```{r}

# faster to make a subset objects of only neurons and use find all markers

neuron.sub <- subset(seu.q, idents = c("DAneurons1","DAneurons2","DAneurons3"))

neuron.sub.markers <- FindAllMarkers(neuron.sub)

top5 <- neuron.sub.markers %>% group_by(cluster) %>% top_n(n=5, wt = avg_log2FC)
DoHeatmap(neuron.sub, features = top5$gene, size=3, angle =90, group.bar.height = 0.02)

DotPlot(neuron.sub, features = top5$gene) + RotatedAxis()

# markers are much clearer for the DA neuron subgroups

# DA neurons 1: WIF1, CYP1B1, IGFBP3, HPD, WFIKKN2
# WIF1 (secreted WNT inhibitor, promotes regeneration), CYP1B1 (redox homeostatis), IGFBP3 (prolactin secretion regulation hypothalmus), HPD (neuro protective), WFIKKN2 (Receptor for TNC)
# DA neurons2: CDH7, RUNX1T1, ASCL1, DLK1, MEG3
# CDH7 (neuro circuitry development, SEMA), RUNX1T1 (neuronal differentiation), ASCL1 (neuronal differentiation), DLK1 (neural differentation), MEG3 (neural homeostatis)  
# DA neurons3: PCAT4, NEUROD1, NCKAP5, GK5, SST
# PCAT4 (dendritic growth), NEUROD1 (neural differentation), NCKAP5 (Excitory neurons), GK5 (TH ), SST (regualates spike times)

# DA neurons1: CYP1B1, IGFBP3
# DA neurons2: RUNX1T1, ASCL1
# DA neurons3: NEUROD1, NCKAP5


```


Name the Neurons2 FACS population with the Neuron subtype latter.

```{r}

Idents(seu.q) <- 'RNA_snn_res.0.6'
cluster.ids <- c("Neurons-MSX1","Neurons-LY6H","Neurons-ASCL1",
                 "Other","DAneurons-CYP1B1","DAneurons-ASCL1","Neurons-GK5",
                 "DAneurons-NEUROD1","RG","Neurons-LY6H","Epithelial","Endothelial")

unique(seu.q$RNA_snn_res.0.6)

names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$cellsubgroups <- Idents(seu.q)

DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'cellsubgroups', repel = TRUE)


```


	preduction summary	Cell_Types
0	neurons	Neurons
1	NPC/oligo/astro/neurons/RG	Neurons
2	neurons	Neurons
3	NPC/RG/Astro	Neurons
4	RG/endo	RG
5	DA neurons	Neurons
6	Neurons	Neurons
7	DA neurons	Neurons
8	RG/endo	Endothelial
9	RG/astro	Astro
10	opc/npc/astro/neurons	Neurons
11	neurons/dividing/RG	Neurons

Clustering higher res
res 1.2	
0	Neurons
1	Neurons
2	Neurons
3	Astro/RG/Neurons
4	RG/Astro/Neurons
5	RG/Neurons/NPC
6	RG/NPC/Endo
7	Neurons DA
8	Neurons
9	Neurons DA
10	Neurons DA
11	RG/Endo
12	RG/Astro 
13	RG/Astro/Neurons
14	Neurons/RG




```{r}

Idents(seu.q) <- 'RNA_snn_res.1.2'
# highlight the DA neurons
cluster.ids <- c("Neurons","Neurons","NPC",
                 "Neurons","Neurons","Neurons",
                 "Neurons DA","Neurons","Neurons DA",
                 "Neurons DA", "Neurons DA","Endothelial",
                 "RG/Astro","RG/Astro/Neurons","Neurons/RG")
cluster.ids <- c("Neurons","Neurons","Neurons",
                 "Neurons","Neurons","NPC",
                 "Neurons","Neurons","Neurons",
                 "Neurons", "Neurons","Endothelial",
                 "Astrocytes","Radial Glia","Radial Glia")

names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$Cell_Types <- Idents(seu.q)

DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'Cell_Types', repel = TRUE)
#DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'RNA_snn_res.1.2', repel = TRUE)

#clustree(seu.q)


```


```{r}

# save the Neurons2 with labels

saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Neurons2LabelsSeu30092022.RDS")

```


FACS population Glia1 (should be astrocytes)

```{r}
# explore filtering
seu <- Glia1
seu
# 
VlnPlot(seu, pt.size = 0.10, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

VlnPlot(seu, pt.size = 0.10, features = c("nFeature_RNA"), y.max = 1000)
VlnPlot(seu, pt.size = 0.10, features = c("nFeature_RNA"), y.max = 500)
VlnPlot(seu, pt.size = 0.10, features = c("nCount_RNA"), y.max = 2000)

# filter more cells

seu.ft <- subset(seu, subset = nFeature_RNA > 300 & nCount_RNA > 500 & nCount_RNA < 10000) 
seu.ft

# still a lot of cells 47295
# will likely remove a lot more with the doublet finder


```

```{r}

saveRDS(seu.ft, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia1AstroSeu01102022.RDS")

seu.ft <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia1AstroSeu01102022.RDS")

seu.ft <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia1AstroSeu01102022.RDS")

```

Doublet finder

```{r}

suppressMessages(require(DoubletFinder))

# filtering out MALAT1 and mitochondrial genes

seu.ft <- seu.ft[!grepl("MALAT1", rownames(seu.ft)), ]
seu.ft <- seu.ft[!grepl("^MT-", rownames(seu.ft)), ]

# like in the tutorial I'm following MALAT1 is the top most expressed gene.  The top genes are a lot of MT and Ribosomal genes

seu.ft[["percent.rb"]] <- PercentageFeatureSet(seu.ft, pattern = "^RP")

# down sample there are too many cells to run doublet finder
seu.sub <- subset(seu.ft, downsample = 20000)

seu.d = NormalizeData(seu.sub)
seu.d = FindVariableFeatures(seu.d, verbose = F)
seu.d = ScaleData(seu.d, vars.to.regress = c("nFeature_RNA", "percent.mt"),
    verbose = F)
seu.d = RunPCA(seu.d, verbose = F, npcs = 15)
seu.d = RunUMAP(seu.d, dims = 1:10, verbose = F)

nExp <- round(ncol(seu.d) * 0.15)  # expect more doublets because there is a lot more cells
seu.d <- doubletFinder_v3(seu.d, pN = 0.25, pK = 0.09, nExp = nExp, PCs = 1:10)
# the memory limit is reached here - I could run on compute canada
# For now I'll downsample
# this works

# name of the DF prediction can change, so extract the correct column name.
DF.name = colnames(seu.d@meta.data)[grepl("DF.classification", colnames(seu.d@meta.data))]


cowplot::plot_grid(ncol = 2, DimPlot(seu.d, group.by = "orig.ident") + NoAxes(),
    DimPlot(seu.d, group.by = DF.name) + NoAxes())

VlnPlot(seu.d, features = "nFeature_RNA", group.by = DF.name, pt.size = 0.1)

```




Remove the doublet cells

```{r}
seu.d <- seu.d[, seu.d@meta.data[, DF.name]== "Singlet"]
dim(seu.d)
dim(seu.sub)

# 20000 pre filter
# creates the expected percentage

```

Repeat workflow with doublet removed data and find clusters for 

```{r}
seu <- NormalizeData(seu.d, normalization.method = "LogNormalize", scale.factor = 10000)
seu <- FindVariableFeatures(seu, selection.method = "vst", nfeatures = 2000)
seu <- ScaleData(seu)
seu <- RunPCA(seu)
seu <- RunUMAP(seu, reduction = "pca", n.neighbors = 43, dims = 1:30)
DimPlot(seu, reduction = "umap")

seu.q <- FindNeighbors(seu, dims = 1:25, k.param = 43)
seu.q <- FindClusters(seu.q, resolution = c(0,0.2,0.4,0.6))
seu.q <- FindClusters(seu.q, resolution = c(0,0.05,0.1,0.8))
library(clustree)
clustree(seu.q)
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.05')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.1')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.2')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.4')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.6')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.8')




```
Look at some expression markers in a feature plot

```{r}
# genes reported up in Astrocytes
FeaturePlot(seu.q, features = c("GFAP","S100B","AQP4","SLC1A3","GJA1",
                                "APOE","TEAD1","GSTA4","SOX9",
                                "VIM","HMG20A","ALDH1L1"))
# almost no GFAP expression and lots of S100B everywhere

```



Predict cell types

```{r}


# SNCA and control midbrain organoids 165 days in culture
MBO <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AST23_BrainComm/MBOclusters_names29072021.rds")

# Midbrain  AIW002 120 days in culture
AIWMBO <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AIWtrio120days/MOintegratedClusterK123res0.8.names_nov16_2021")

# Midbrain AIW002 60 days in culture

AIW60 <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AIWtrio60days/AWI002ParkinKOPinkKO60days_labels_14052022.rds")


#first predict with the MBO data
Idents(MBO) <- "cluster_labels"
DefaultAssay(MBO) <- "RNA"

# find the reference anchors
print("finding reference anchors")
anchors <- FindTransferAnchors(reference = MBO ,query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = MBO$cluster_labels)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$MBOAST23.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'MBOAST23.pred', label = TRUE)
 
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.8, seu.q$MBOAST23.pred))
t.lables$Freq <- as.double(t.lables$Freq)

# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")

# clusters don't break up by the predicted cell types

############ another predictions now using the AIW organoids

Idents(AIWMBO) <- "res08names"
DefaultAssay(AIWMBO) <- "RNA"

anchors <- FindTransferAnchors(reference = AIWMBO ,query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = AIWMBO$res08names)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$MBOAIW.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'MBOAIW.pred', label = TRUE)
 
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.8, seu.q$MBOAIW.pred))
t.lables$Freq <- as.double(t.lables$Freq)

# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")

# the predicted cell types make more sense from the AIW002 organoid
# now predict with the AIW002 60 days organoid

Idents(AIW60) <- "cluster.ids"
DefaultAssay(AIW60) <- "RNA"

anchors <- FindTransferAnchors(reference = AIW60, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = AIW60$cluster.ids) 
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$AIW60.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'AIW60.pred', label = TRUE)
 
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.8, seu.q$AIW60.pred))
t.lables$Freq <- as.double(t.lables$Freq)

# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")

# save ojbect with predicitons
saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia1PredictionsSeu01102022.RDS")




```


Predict with the brain scRNAseq

```{r}

pathway <- "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/"
seu.q <- readRDS(paste(pathway,"Glia1LabledSeu301102022.RDS",sep = ""))


# read in the reference dataset
# from Bhaduri midbrain and striatum
seu.r <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PublicData/Bhaduri_wholeBrain/Bhaduri_midbrain_striatum.RDS")

Idents(seu.r) <- "cell_cluster"

# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_cluster)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
seu.q$Bha.mid.stri.pred <- Idents(seu.q)
print(table(seu.q$Bha.mid.stri.pred))
DimPlot(seu.q, group.by = 'Bha.mid.stri.pred')
DimPlot(seu.q, group.by = 'subgroups')

# do the predictions differ with the main cell type groups instead of the cluster in the reference data? 
Idents(seu.r) <- "cell_type"

# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_type)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))
DimPlot(seu.q, group.by = 'predicted.id')

```





```{r}

# AIW002 120 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$MBOAIW.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AIW120 <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.aiw120 <- top.pred.celltype.AIW120[order(top.pred.celltype.AIW120$Var1,-top.pred.celltype.AIW120$Freq),]
row.names(df.top.aiw120) <- NULL
df.top.aiw120$I <- row.names(df.top.aiw120)

# AIW002 60 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$AIW60.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AIW60 <-as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.aiw60 <- top.pred.celltype.AIW60[order(top.pred.celltype.AIW60$Var1,-top.pred.celltype.AIW60$Freq),]
row.names(df.top.aiw60) <- NULL
df.top.aiw60$I <- row.names(df.top.aiw60)


# AST23 165 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$MBOAST23.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AST23 <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.AST23 <- top.pred.celltype.AST23[order(top.pred.celltype.AST23$Var1,-top.pred.celltype.AST23$Freq),]
row.names(df.top.AST23) <- NULL
df.top.AST23$I <- row.names(df.top.AST23)

### add in the prediction from brain data Bhaduri midbrain and striatum
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$Bha.mid.stri.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.Bha <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.Bha <- top.pred.celltype.Bha[order(top.pred.celltype.Bha$Var1,-top.pred.celltype.Bha$Freq),]
row.names(df.top.Bha) <- NULL
df.top.Bha$I <- row.names(df.top.Bha)

## these are calculated below
### add in the prediction from brain whole brain data Bhaduri midbrain down sampled
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$Bha))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.Bha1 <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.Bha1 <- top.pred.celltype.Bha1[order(top.pred.celltype.Bha$Var1,-top.pred.celltype.Bha$Freq),]
row.names(df.top.Bha1) <- NULL
df.top.Bha1$I <- row.names(df.top.Bha1)

pred.table <- merge(df.top.AST23, df.top.aiw60, by = 'I', all = TRUE)
pred.table <- merge(pred.table, df.top.aiw120, by = 'I')
pred.table <- merge(pred.table, df.top.Bha, by = 'I')
pred.table <- merge(pred.table, df.top.Bha1, by = 'I')
pred.table

```
These predictions are not good.  There are several astrocyte markers by expression levels.  Everything is predicted as Radial glia or oligo dendrocytes


Try to predict with the whole brain and see if it's different

```{r}
# read in the reference dataset
# from Bhaduri midbrain and striatum
seu.r <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PublicData/Bhaduri_wholeBrain/Bhaduri_downsample.RDS")

Idents(seu.r) <- "cell_cluster"

# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_cluster)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
seu.q$Bha <- Idents(seu.q)
print(table(seu.q$Bha))
DimPlot(seu.q, group.by = 'Bha')
DimPlot(seu.q, group.by = 'subgroups')





```




Try to predict with the astrocyte Kamath data

```{r}

astro.ref <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/Macosko_Data/PD_astro.Rds")
# need to make PCA and UMAP
astro.ref <- NormalizeData(astro.ref)
astro.ref <- FindVariableFeatures(astro.ref, selection.method = "vst", nfeatures = 2000)
astro.ref <- ScaleData(astro.ref)
astro.ref <- RunPCA(astro.ref)
astro.ref <- RunUMAP(astro.ref, reduction = "pca", n.neighbors = 205, dims = 1:25)

colnames(astro.ref@meta.data)


Idents(astro.ref) <- "Cell_Subtype"
DefaultAssay(astro.ref) <- "RNA"

# find the reference anchors
print("finding reference anchors")
anchors <- FindTransferAnchors(reference = astro.ref ,query = seu.q, dims = 1:20)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = astro.ref$Cell_Subtype, k.weight = 10)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$astro.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'astro.pred', label = TRUE)
table(seu.q$astro.pred)

seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, NA)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
seu.q$astro.pred.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'astro.pred.thresh', label = TRUE)
table(seu.q$astro.pred.thresh)

# 19986 Astro_VIM_TNFSRF12A no threshold      Astro_GLYATL2 14
# 8376 Astro_VIM_TNFSRF12A   with 95% threshold


t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$astro.pred.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.astro <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.astro <- top.pred.astro[order(top.pred.astro$Var1,-top.pred.astro$Freq),]
row.names(df.top.astro) <- NULL


t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$astro.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.astro <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.astro <- top.pred.astro[order(top.pred.astro$Var1,-top.pred.astro$Freq),]
row.names(df.top.astro) <- NULL



```



Possible predicted in other clusters

```{r}
clustree(seu.q)

```

Make the prediction table for high resolution Res 0.8 12 clusters

```{r}
# AIW002 120 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.8, seu.q$MBOAIW.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AIW120 <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.aiw120 <- top.pred.celltype.AIW120[order(top.pred.celltype.AIW120$Var1,-top.pred.celltype.AIW120$Freq),]
row.names(df.top.aiw120) <- NULL
df.top.aiw120$I <- row.names(df.top.aiw120)

# AIW002 60 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.8, seu.q$AIW60.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AIW60 <-as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.aiw60 <- top.pred.celltype.AIW60[order(top.pred.celltype.AIW60$Var1,-top.pred.celltype.AIW60$Freq),]
row.names(df.top.aiw60) <- NULL
df.top.aiw60$I <- row.names(df.top.aiw60)


# AST23 165 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.8, seu.q$MBOAST23.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AST23 <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.AST23 <- top.pred.celltype.AST23[order(top.pred.celltype.AST23$Var1,-top.pred.celltype.AST23$Freq),]
row.names(df.top.AST23) <- NULL
df.top.AST23$I <- row.names(df.top.AST23)

### add in the prediction from brain data Bhaduri midbrain and striatum
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.8, seu.q$Bha.mid.stri.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.Bha <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.Bha <- top.pred.celltype.Bha[order(top.pred.celltype.Bha$Var1,-top.pred.celltype.Bha$Freq),]
row.names(df.top.Bha) <- NULL
df.top.Bha$I <- row.names(df.top.Bha)

## these are calculated below
### add in the prediction from brain whole brain data Bhaduri midbrain down sampled
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.8, seu.q$Bha))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.Bha1 <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.Bha1 <- top.pred.celltype.Bha1[order(top.pred.celltype.Bha$Var1,-top.pred.celltype.Bha$Freq),]
row.names(df.top.Bha1) <- NULL
df.top.Bha1$I <- row.names(df.top.Bha1)

pred.table <- merge(df.top.AST23, df.top.aiw60, by = 'I', all = TRUE)
pred.table <- merge(pred.table, df.top.aiw120, by = 'I')
pred.table <- merge(pred.table, df.top.Bha, by = 'I')
pred.table <- merge(pred.table, df.top.Bha1, by = 'I')
pred.table

```




Look at cluster markers

```{r}

Idents(seu.q) <- 'RNA_snn_res.0.2'
ClusterMarkers <- FindAllMarkers(seu.q, only.pos = TRUE)

top5 <- ClusterMarkers %>% group_by(cluster) %>% top_n(n=5, wt = avg_log2FC)
DoHeatmap(seu.q, features = top5$gene, size=3, angle =90, group.bar.height = 0.02)

write.csv(ClusterMarkers,"/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/Glia1AstrocytesClusterMarkers_new.csv")

# for the res 0.6 the largers groups 0 and 1 don't have great markers and none of the markers are really very good.  
# I'll rerun with res 0.2
unique(seu.q$RNA_snn_res.0.2)

# still not much better



```



Check cell type markers with EnrichR

```{r}

library(enrichR)

setEnrichrSite("Enrichr") # Human genes
# list of all the databases

# libaries with cell types

db <- c('Allen_Brain_Atlas_up','Descartes_Cell_Types_and_Tissue_2021',
        'CellMarker_Augmented_2021','Azimuth_Cell_Types_2021')

# enrichr(genes, databases = NULL)

#I'll run the clusters one at a time

N1.c0 <- ClusterMarkers %>% filter(cluster == 5 & avg_log2FC > 0)
genes <- N1.c0$gene

N1.c0.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c0.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c0.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c0.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c0.Er[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")

N1.Er.genes.1 <- N1.c0.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1

N1.Er.genes.2 <- N1.c0.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2

N1.Er.genes.3 <- N1.c0.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3


N1.Er.genes.4 <- N1.c0.Er[[4]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.4



# cluster 0 - Cell type marker library - Brain astrocyte top hit and embryonic astrocytes
# gene list in term: brain astrocyte	EFEMP1;NFIA;LIX1;PSAP;KIF21A;S100B;CRYAB;DKK3
# embryo astrocytes	SOX2;BEX1;HMGCS1;PTPRZ1;LIX1;S100B;DKK3;ITM2C

# cluster 1 - stem cell pericyte (brain), stelate, astrocyte

# cluster 2 - hypothalmus, endothelial cells, macrophage
# endothelial CSTB;PRELID1;MT1X;CRIP2;RHOC;TMEM141;MT2A;RPS28;CCDC85B;EIF3I;RBP1;ID1;C4ORF3;ID3;PCBD1;MSX1;PPIC

# cluster 3 - smooth muscle cells

# cluster 4 - NK cells, fibroblasts
# NK cells ITGB1;RAB5C;GSTP1;PDCD5;EEF1B2;TGOLN2;SDCBP;MT2A;LDHA;SNRPD2;YWHAQ;ZNF326;TMSB10;CCDC50
# fibroblasts 	COL3A1;CALD1;COL6A3

# cluster 5 - endothelial cells, NK cells, CD8+

# cluster 6 - stromal cells eurythroblasts, none-neuronal, oligo

# reran and now there are only 5 clusters
# repeat checking


```

Cluster 0 - astrocytes
Cluster 1 - pericyte astrocyte (weak still)
Cluster 2 - endothelial
Cluster 3 - smooth muscle
Cluster 4 - NK/fibroblast
Cluster 5 - endothelial
Cluster 6 - non- neuronal







```{r}

VlnPlot(seu.q, features = c("CD44","ITGB1","S100B"), group.by = 'orig.ident')
VlnPlot(seu.ft, features = c("CD44","ITGB1","S100B"), group.by = 'orig.ident')

```

Check expression of known markers

```{r}

Idents(seu.q) <- 'RNA_snn_res.0.2'

feature_list = c("MKI67","SOX2","POU5F1","DLX2","PAX6","SOX9","HES1","NES","RBFOX3","MAP2","NCAM1","CD24","GRIA2","GRIN2B","GABBR1","GAD1","GAD2","GABRA1","GABRB2","TH","ALDH1A1","LMX1B","NR4A2","CORIN","CALB1","KCNJ6","CXCR4","ITGA6","SLC1A3","CD44","AQP4","S100B", "PDGFRA","OLIG2","MBP","CLDN11","VIM","VCAM1")

DoHeatmap(seu.q, features = feature_list, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = feature_list) +RotatedAxis()

PD_poulin = c("TH","SLC6A3","SLC18A2","SOX6","NDNF","SNCG","ALDH1A1","CALB1","TACR2","SLC17A6","SLC32A1","OTX2","GRP","LPL","CCK","VIP")

DoHeatmap(seu.q, features = PD_poulin, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = PD_poulin)+RotatedAxis()

ealryNeur = c("DCX","NEUROD1","TBR1")
proliferation = c("PCNA","MKI67")
neuralstem = c("SOX2","NES","PAX6","MASH1")

feature_list <- c("DCX","NEUROD1","TBR1","PCNA","MKI67","SOX2","NES","PAX6","MASH1")
DoHeatmap(seu.q, features = feature_list, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = feature_list)+RotatedAxis()


mat_neuron = c("RBFOX3","SYP","DLG45","VAMP1","VAMP2","TUBB3","SYT1","BSN","HOMER1","SLC17A6") 
# NeuN is FOX3 - RBFOX3
# PSD95 also SP-90 or DLG4
# VGLUT2 is SLC17A6
DoHeatmap(seu.q, features = mat_neuron, size=3, angle =90, group.bar.height = 0.02)
# cluster 4 also show mature neuron markers
DotPlot(seu.q, features = mat_neuron)+RotatedAxis()
# excitatory neuron markers
ex = c("GRIA2","GRIA1","GRIA4","GRIN1","GRIN2B","GRIN2A","GRIN3A","GRIN3","GRIP1","CAMK2A")
DoHeatmap(seu.q, features = ex, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = ex)+RotatedAxis()
# inhibitory neuron markers
inh = c("GAD1","GAD2", "GAT1","PVALB","GABR2","GABR1","GBRR1","GABRB2","GABRB1","GABRB3","GABRA6","GABRA1","GABRA4","TRAK2")
DoHeatmap(seu.q, features = inh, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = inh)+RotatedAxis()
# cluster 4 is more excitatory than inhbitory but neither marker set has much expression 

### glia markers
microglia = c("PTPRC","AIF1","ADGRE1")  # ADGRE1 is a microglia marker F4/80, CD45 is PTPRC, gene name IBA1 is AIF1
astolgNPCpromicro = c("GFAP","S100B","SLC1A2","MBP","SOX10","SPP1","DCX","NEUROD1","TBR1","PCNA","MKI67","PTPRC","AIF1","ADGRE1")
# note GLT1 is EAAT2 which is SLC1A2 glutatmate transporter
# epithelial
epi = c("HES1","HES5","SOX2","SOX10","NES","CDH1","NOTCH1") # e-cadherin is CDH1

DoHeatmap(seu.q, features = astolgNPCpromicro, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = astolgNPCpromicro)+RotatedAxis()
# cluster 4 is more excitatory than inhbitory but neither marker set has much expression 
DoHeatmap(seu.q, features = epi, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = epi)+RotatedAxis()

# also add Radial glia marker overlap with Glia and Neurons

features <- c("PTPRC","AIF1","ADGRE1", "VIM", "TNC","PTPRZ1","FAM107A","HOPX","LIFR",
              "ITGB5","IL6ST")
DoHeatmap(seu.q, features = features, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = features)+RotatedAxis()




```

No TH expression

clusters 5 VIM highest, S100 B 
Cluster 4 
Cluster 3 has some cells with high OTX2, NES indicates NPC/Precursors
Cluster 2 has some SOX2 and PAX6 indicates NPC, VAMP2 indicating neurons, S100B highest and most - indicates astrocytes, also MBP indicates oligos
Cluster 1
Cluster 0 


Lable the clusters 

```{r}

Idents(seu.q) <- 'RNA_snn_res.0.2'

#seu.q <- BuildClusterTree(seu.q, reorder = TRUE, reorder.numeric = TRUE)
unique(seu.q$RNA_snn_res.0.2)

cluster.ids <- c("Astrocytes1","Astrocytes2","Precursors","RG1","RG2","Endothelial")

names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$subgroups <- Idents(seu.q)

#DimPlot(seu.q, group.by = 'RNA_snn_res.0.2', label = TRUE)
DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'subgroups', repel = TRUE)

# something weird is going on in the order

#saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia1LabledSeu301102022.RDS")


```



Compare the Astrocyte groups and get some markers for sub groups

```{r}


astro.sub.markers <- FindMarkers(seu.q, ident.1 = "Astrocytes1", ident.2 = "Astrocytes2", only.pos = FALSE)
#top5 <- astro.sub.markers %>% top_n(n=5, wt = avg_log2FC)

DoHeatmap(seu.q, features = c("PLCG2","PTPRZ1","SNHG25","VCAN","LUM","DCN","S1004A"), size=3, angle =90, group.bar.height = 0.02)

astro2 <- rownames(astro.sub.markers %>% filter(avg_log2FC < 0.05))

DotPlot(seu.q, features = c("PLCG2","PTPRZ1","SNHG25","VCAN","LUM","DCN","S1004A")) + RotatedAxis()

DoHeatmap(seu.q, features = astro2[1:15], size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = astro2[1:15]) +RotatedAxis()


# radial glia subtyping
Idents(seu.q) <- ('subgroups')
rg.sub.markers <- FindMarkers(seu.q, ident.1 = "RG1", ident.2 = "RG2", only.pos = FALSE)
top5.up <- rg.sub.markers %>% top_n(n=10, wt = avg_log2FC)
top5.down <- rg.sub.markers %>% top_n(n=-10, wt = avg_log2FC)
ft <- rownames(top5.up)

DoHeatmap(seu.q, features = ft, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = ft) + RotatedAxis()


ft <- rownames(top5.down)

DoHeatmap(seu.q, features = ft, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = ft) + RotatedAxis()

npc.markers <- FindMarkers(seu.q, ident.1 = "Precursors", ident.2 = c("Astrocytes2","Astrocytes1"), only.pos = FALSE)
top10.npc <- npc.markers %>% top_n(n=10, wt = avg_log2FC)
npc.markers <- npc.markers %>% filter(avg_log2FC > 0)
dim(npc.markers)

ft <- rownames(top10.npc)
# consider naming precursors as astrocytes3
DoHeatmap(seu.q, features = ft, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = ft) + RotatedAxis()


npc.markers.rg <- FindMarkers(seu.q, ident.1 = "Precursors", ident.2 = c("RG2","RG1"), only.pos = TRUE)
top10.npc <- npc.markers.rg %>% top_n(n=10, wt = avg_log2FC)

ft <- rownames(top10.npc)
# consider naming precursors as astrocytes3
DoHeatmap(seu.q, features = ft, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = ft) + RotatedAxis()

## these have more differences from RG than Astrocytes


```

Add subtype gene ids

```{r}

DimPlot(seu.q, group.by = 'RNA_snn_res.0.2')

cluster.ids <- c("Astrocytes-PLCG2","Astrocytes-DNC","Astrocytes-IGFBP2",
                 "RG1-CDKN1C","RG2-TYRP1","Endothelial")
#cluster.ids <- c("Astrocytes1","Astrocytes2","Precursors(Astrocytes)","RG1","RG2","Endothelial")

names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$Cell_types <- Idents(seu.q)

DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'Cell_types', repel = TRUE)


saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia1LabledSeu301102022.RDS")

# label with main cell type groups 

DimPlot(seu.q, group.by = 'RNA_snn_res.0.2')

cluster.ids <- c("Astrocytes","Astrocytes","Astrocytes",
                 "RG","RG","Endothelial")
#cluster.ids <- c("Astrocytes1","Astrocytes2","Precursors(Astrocytes)","RG1","RG2","Endothelial")

names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$Cell_Type <- Idents(seu.q)

DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'Cell_Type', repel = TRUE)


saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia1LabledSeu301102022.RDS")




```


Label main cell types 
Res 0.8	predicitons
0	astro
1	Endo/RG/Astro
2	Astrocyte
3	RG/Endo/Astro
4	Astro
5	Astro
6	Epithela, endo, astro, RG
7	Astro
8	Astro
9	Astro/RG/Endo
10	Astro/RG
11	Astro/Neurons
12	Endo/RG. 


```{r}

#DimPlot(seu.q, group.by = 'RNA_snn_res.0.8')
Idents(seu.q) <- 'RNA_snn_res.0.8'

cluster.ids <- c("Astrocytes","Astrocytes-Endo-RG","Astrocytes",
                 "RG-Endo-Astro","Astro","Astro","Epi-Endo-Astro-RG",
                 "Astro","Astro","Astro-RG-Endo","Astro-RG","Astro-Neurons","Endo-RG")

cluster.ids <- c("Astrocytes","Astrocytes","Astrocytes",
                 "Astrocytes","Astrocytes","Astrocytes","Endothelial",
                 "Astrocytes","Astrocytes","Radial Glia","Radial Glia","Astrocytes","Endothelial")

names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$Cell_Type <- Idents(seu.q)

DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'Cell_Type', repel = TRUE)


saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia1LabledSeu301102022.RDS")



```

```{r}
table(seu.q$Cell_Type)

```



Quick check the Glial2 

```{r}

# explore filtering
seu <- Glia2
seu
# 
VlnPlot(seu, pt.size = 0.10, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

VlnPlot(seu, pt.size = 0.10, features = c("nFeature_RNA"), y.max = 1000)
VlnPlot(seu, pt.size = 0.10, features = c("nFeature_RNA"), y.max = 500)
VlnPlot(seu, pt.size = 0.10, features = c("nCount_RNA"), y.max = 2000)

# filter more cells

seu.ft <- subset(seu, subset = nFeature_RNA > 250 & nCount_RNA > 250 & nCount_RNA < 10000) 
seu.ft

VlnPlot(seu.ft, pt.size = 0.10, features = c("nFeature_RNA"), y.max = 2000)

```
```{r}

VlnPlot(seu.ft.glia, features = c("CD44","S100B","ITGB1"))
VlnPlot(seu.ft, features = c("CD44","S100B","ITGB1"), group.by = 'orig.ident')
# both glia populations are similar


```

Levels seem similar in Glia1 and Glia2

Remove doublets and start to process Glia 2

```{r}

suppressMessages(require(DoubletFinder))

# filtering out MALAT1 and mitochondrial genes

seu.ft <- seu.ft[!grepl("MALAT1", rownames(seu.ft)), ]
seu.ft <- seu.ft[!grepl("^MT-", rownames(seu.ft)), ]

# like in the tutorial I'm following MALAT1 is the top most expressed gene.  The top genes are a lot of MT and Ribosomal genes

seu.ft[["percent.rb"]] <- PercentageFeatureSet(seu.ft, pattern = "^RP")

seu.d = NormalizeData(seu.ft)
seu.d = FindVariableFeatures(seu.d, verbose = F)
seu.d = ScaleData(seu.d, vars.to.regress = c("nFeature_RNA", "percent.mt"),
    verbose = F)
seu.d = RunPCA(seu.d, verbose = F, npcs = 15)
seu.d = RunUMAP(seu.d, dims = 1:10, verbose = F)

nExp <- round(ncol(seu.d) * 0.08)  # expect more doublets because there is a lot more cells
seu.d <- doubletFinder_v3(seu.d, pN = 0.25, pK = 0.09, nExp = nExp, PCs = 1:10)
# the memory limit is reached here - I could run on compute canada
# For now I'll downsample
# this works

# name of the DF prediction can change, so extract the correct column name.
DF.name = colnames(seu.d@meta.data)[grepl("DF.classification", colnames(seu.d@meta.data))]


cowplot::plot_grid(ncol = 2, DimPlot(seu.d, group.by = "orig.ident") + NoAxes(),
    DimPlot(seu.d, group.by = DF.name) + NoAxes())

VlnPlot(seu.d, features = "nFeature_RNA", group.by = DF.name, pt.size = 0.1)

```


```{r}

seu.d <- seu.d[, seu.d@meta.data[, DF.name]== "Singlet"]
dim(seu.d)
dim(seu.ft)



```

Cluster 

```{r}
seu <- NormalizeData(seu.d, normalization.method = "LogNormalize", scale.factor = 10000)
seu <- FindVariableFeatures(seu, selection.method = "vst", nfeatures = 2000)
seu <- ScaleData(seu)
seu <- RunPCA(seu)
seu <- RunUMAP(seu, reduction = "pca", n.neighbors = 25, dims = 1:30)
DimPlot(seu, reduction = "umap")

seu.q <- FindNeighbors(seu, dims = 1:25, k.param = 25)
seu.q <- FindClusters(seu.q, resolution = c(0,0.05,0.2,0.4,0.5,0.6,0.8))
library(clustree)

DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.05')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.1')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.2')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.4')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.6')
DimPlot(seu.q, reduction = "umap", group.by = 'RNA_snn_res.0.8')
clustree(seu.q)
# 0.4 is likely the best annotate subgroups


```

Predict cell types 


```{r}

# SNCA and control midbrain organoids 165 days in culture
MBO <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AST23_BrainComm/MBOclusters_names29072021.rds")

# Midbrain  AIW002 120 days in culture
AIWMBO <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AIWtrio120days/MOintegratedClusterK123res0.8.names_nov16_2021")

# Midbrain AIW002 60 days in culture

AIW60 <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/AIWtrio60days/AWI002ParkinKOPinkKO60days_labels_14052022.rds")


#first predict with the MBO data
Idents(MBO) <- "cluster_labels"
DefaultAssay(MBO) <- "RNA"

# find the reference anchors
print("finding reference anchors")
anchors <- FindTransferAnchors(reference = MBO ,query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = MBO$cluster_labels)
seu.q <- AddMetaData(seu.q, metadata = predictions)

Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$MBOAST23.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'MBOAST23.pred', label = TRUE)

# see how accurate the predictions are
seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, "None")

Idents(seu.q) <- 'predicted.id'
seu.q$MBOAST23.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'predicted.id', label = TRUE)
table(seu.q$MBOAST23.pred)
table(seu.q$MBOAST23.thresh)


 
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.4, seu.q$MBOAST23.pred))
t.lables$Freq <- as.double(t.lables$Freq)

# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")

# clusters don't break up by the predicted cell types

############ another predictions now using the AIW organoids

Idents(AIWMBO) <- "res08names"
DefaultAssay(AIWMBO) <- "RNA"

anchors <- FindTransferAnchors(reference = AIWMBO ,query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = AIWMBO$res08names)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$AIW120.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'AIW120.pred', label = TRUE)
 
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.4, seu.q$MBOAIW.pred))
t.lables$Freq <- as.double(t.lables$Freq)
# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")

# see how accurate the predictions are
seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, "None")

Idents(seu.q) <- 'predicted.id'
seu.q$AIW120.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'AIW120.thresh', label = TRUE)
table(seu.q$AIW120.pred)
table(seu.q$AIW120.thresh)

# the predicted cell types make more sense from the AIW002 organoid
# now predict with the AIW002 60 days organoid

Idents(AIW60) <- "cluster.ids"
DefaultAssay(AIW60) <- "RNA"

anchors <- FindTransferAnchors(reference = AIW60, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = AIW60$cluster.ids) 
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$AIW60.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'AIW60.pred', label = TRUE)
 
## check the proportion of cell types predicted in each cluster
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.4, seu.q$AIW60.pred))
t.lables$Freq <- as.double(t.lables$Freq)

# try bar chart
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity")

# see how accurate the predictions are
seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, "None")

Idents(seu.q) <- 'predicted.id'
seu.q$AIW60.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'AIW60.thresh', label = TRUE)
table(seu.q$AIW60.pred)
table(seu.q$AIW60.thresh)



# most of the cells are predicted as NPCs in many populations



```

```{r}
# save with predictions so far
#saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia2LabledSeu03102022.RDS")

seu.q <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia2LabledSeu03102022.RDS")


```


See how many cells are predicted as astrocytes with the threshold

```{r}

DAsubtypes <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/Macosko_Data/DAsubgroups_processed.Rds")

Idents(astro.ref) <- "Cell_Subtype"
DefaultAssay(astro.ref) <- "RNA"

# find the reference anchors
print("finding reference anchors")
anchors <- FindTransferAnchors(reference = DAsubtypes ,query = seu.q, dims = 1:20)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = astro.ref$Cell_Subtype, k.weight = 10)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$astro.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'astro.pred', label = TRUE)
table(seu.q$astro.pred)

seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, "none")
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
seu.q$astro.pred.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'astro.pred.thresh', label = TRUE)
table(seu.q$astro.pred.thresh)


t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$astro.pred.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.astro <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.astro <- top.pred.astro[order(top.pred.astro$Var1,-top.pred.astro$Freq),]
row.names(df.top.astro) <- NULL


t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$astro.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.astro <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.astro <- top.pred.astro[order(top.pred.astro$Var1,-top.pred.astro$Freq),]
row.names(df.top.astro) <- NULL

# a lot of these cells are also getting labelled as astrocytes


```

Do these get labelled as DA neurons too???

```{r}
seu.q <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia2LabledSeu03102022.RDS")

```



```{r}

DAsubtypes <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/Macosko_Data/DAsubgroups_processed.Rds")
Idents(DAsubtypes) <- "Cell_Subtype"
da.ref <- subset(DAsubtypes, downsample = 500)

# find the reference anchors
print("finding reference anchors")
anchors <- FindTransferAnchors(reference = da.ref, query = seu.q, dims = 1:20)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = da.ref$Cell_Subtype, k.weight = 10)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
# add new dataslot for MBO predicted ID to make the next prediction
seu.q$da.pred <- Idents(seu.q)
DimPlot(seu.q, group.by = 'da.pred', label = TRUE)
table(seu.q$da.pred)

seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, "none")

Idents(seu.q) <- 'predicted.id'
seu.q$da.pred.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'da.pred.thresh', label = TRUE)
table(seu.q$da.pred.thresh)


t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$da.pred.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.astro <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.astro <- top.pred.astro[order(top.pred.astro$Var1,-top.pred.da$Freq),]
row.names(df.top.astro) <- NULL


t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$astro.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.astro <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.astro <- top.pred.astro[order(top.pred.astro$Var1,-top.pred.astro$Freq),]
row.names(df.top.astro) <- NULL


# after thresholding very few cells are predicted as neurons



```

{redicte with the brain scRNAseq

```{r}

pathway <- "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/"
seu.q <- readRDS(paste(pathway,"Glia2LabledSeu03102022.RDS",sep = ""))

# midbrain and striatum

seu.r <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PublicData/Bhaduri_wholeBrain/Bhaduri_midbrain_striatum.RDS")

Idents(seu.r) <- "cell_cluster"

# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_cluster)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
seu.q$Bha.mid.stri.pred <- Idents(seu.q)
print(table(seu.q$Bha.mid.stri.pred))


seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, "none")
Idents(seu.q) <- 'predicted.id'
seu.q$Bha.mid.pred.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'Bha.mid.pred.thresh', label = TRUE)
table(seu.q$Bha.mid.pred.thresh)
DimPlot(seu.q, group.by = 'Bha.mid.pred.thresh')


# read in the reference dataset
# whole brain Bhaduri down sampled
seu.r <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PublicData/Bhaduri_wholeBrain/Bhaduri_downsample.RDS")

Idents(seu.r) <- "cell_cluster"

# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_cluster)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
seu.q$Bha.pred <- Idents(seu.q)
print(table(seu.q$Bha.pred))
DimPlot(seu.q, group.by = 'Bha.pred')
DimPlot(seu.q, group.by = 'subgroups')

seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, "none")
Idents(seu.q) <- 'predicted.id'
seu.q$Bha.mid.pred.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'Bha.pred.thresh', label = TRUE)
table(seu.q$Bha.mid.pred.thresh)
DimPlot(seu.q, group.by = 'Bha.pred.thresh')


```



Compare predictions - make a predictions table

```{r}

# AIW002 120 days predictions - take the thresholded options
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$AIW120.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AIW120 <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.aiw120 <- top.pred.celltype.AIW120[order(top.pred.celltype.AIW120$Var1,-top.pred.celltype.AIW120$Freq),]
row.names(df.top.aiw120) <- NULL
df.top.aiw120$I <- row.names(df.top.aiw120)

# AIW002 60 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$AIW60.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AIW60 <-as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.aiw60 <- top.pred.celltype.AIW60[order(top.pred.celltype.AIW60$Var1,-top.pred.celltype.AIW60$Freq),]
row.names(df.top.aiw60) <- NULL
df.top.aiw60$I <- row.names(df.top.aiw60)


# AST23 165 days predictions
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$MBOAST23.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.AST23 <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.AST23 <- top.pred.celltype.AST23[order(top.pred.celltype.AST23$Var1,-top.pred.celltype.AST23$Freq),]
row.names(df.top.AST23) <- NULL
df.top.AST23$I <- row.names(df.top.AST23)

# add the threshold Astro predictions 
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$astro.pred.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.astro <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.astro <- top.pred.celltype.astro[order(top.pred.celltype.astro$Var1,-top.pred.celltype.astro$Freq),]
row.names(df.top.astro) <- NULL
df.top.astro$I <- row.names(df.top.astro)

# add the neurons predictions 
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$da.pred.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.da <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.da <- top.pred.celltype.da[order(top.pred.celltype.da$Var1,-top.pred.celltype.da$Freq),]
row.names(df.top.da) <- NULL
df.top.da$I <- row.names(df.top.da)


### add in the prediction from brain data Bhaduri midbrain and striatum
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$Bha.mid.pred.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.Bha <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.Bha <- top.pred.celltype.Bha[order(top.pred.celltype.Bha$Var1,-top.pred.celltype.Bha$Freq),]
row.names(df.top.Bha) <- NULL
df.top.Bha$I <- row.names(df.top.Bha)

## these are calculated below
### add in the prediction from brain whole brain data Bhaduri midbrain down sampled
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$Bha.pred.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype.Bha1 <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top.Bha1 <- top.pred.celltype.Bha1[order(top.pred.celltype.Bha$Var1,-top.pred.celltype.Bha$Freq),]
row.names(df.top.Bha1) <- NULL
df.top.Bha1$I <- row.names(df.top.Bha1)

pred.table <- merge(df.top.AST23, df.top.aiw60, by = 'I', all = TRUE)
pred.table <- merge(pred.table, df.top.aiw120, by = 'I')
pred.table <- merge(pred.table, df.top.Bha, by = 'I')
pred.table <- merge(pred.table, df.top.Bha1, by = 'I')
pred.table



```

Predicted cluster annotations
0	Unknown/ NPC
1	RG
2	astro
3	RG
4	neurons
5	RG

Predict from developing cortex 

```{r}

pathway <- "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/"
seu.q <- readRDS(paste(pathway,"Glia2LabledSeu03102022.RDS",sep = ""))

seu.r <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PublicData/Nowakowski_dev_cortext.RDS")
colnames(seu.r@meta.data)

Idents(seu.r) <- "WGCNAcluster"

# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = seu.r$WGCNAcluster)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
seu.q$dev.cortex.pred <- Idents(seu.q)
print(table(seu.q$dev.cortex.pred))
DimPlot(seu.q, group.by = 'dev.cortex.pred')


# add the threshold 
seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.8, seu.q$predicted.id, "none")
Idents(seu.q) <- 'predicted.id'
seu.q$dev.cortex.pred.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'dev.cortex.pred.thresh', label = TRUE)
table(seu.q$dev.cortex.pred.thresh)


t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$dev.cortex.pred.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top <- top.pred.celltype[order(top.pred.celltype$Var1,-top.pred.celltype$Freq),]
row.names(df.top) <- NULL
df.top$I <- row.names(df.top)


# almost everything is predicted as 'none' when theshold is .95 check predictions without threshold and there are many 
# run again with lower threshold 0.8

t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$dev.cortex.pred))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(4, Freq))
df.top <- top.pred.celltype[order(top.pred.celltype$Var1,-top.pred.celltype$Freq),]
row.names(df.top) <- NULL
df.top$I <- row.names(df.top)

df.top.ft <- df.top %>% filter(Freq > 0)


```


The dev cortex doesn't predict Glia 2 well
Try the developing forebrain

```{r}
seu.r <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PublicData/Karolinski_DevForebrain_downsample_Level1.RDS")
colnames(seu.r@meta.data)
DefaultAssay(seu.r) <- 'RNA'

# clusters has subgroups
# levels are main cell groups 
Idents(seu.r) <- "Level1"


# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = seu.r$Level1)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
seu.q$fb.pred <- Idents(seu.q)
print(table(seu.q$fb.pred))
DimPlot(seu.q, group.by = 'fb.pred')


# add the threshold 
seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.80, seu.q$predicted.id, "none")
Idents(seu.q) <- 'predicted.id'
seu.q$fb.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'fb.thresh', label = TRUE)
table(seu.q$fb.thresh)

# make the tables 
t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$fb.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(2, Freq))
df.top <- top.pred.celltype[order(top.pred.celltype$Var1,-top.pred.celltype$Freq),]
row.names(df.top) <- NULL
df.top$I <- row.names(df.top)

#VLMC is vascular and leptomeninges

# almost everything is predicted as 'none' when theshold is .95 
# run again with lower threshold 0.8 and many are predicted as RG

# try predicting with the cluster labels 
# later I can subset cell types for the reference data

Idents(seu.r) <- "Clusters"

anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = seu.r$Clusters)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
seu.q$fb.sub.pred <- Idents(seu.q)
print(table(seu.q$fb.sub.pred))
DimPlot(seu.q, group.by = 'fb.sub.pred')


# add the threshold 
seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.80, seu.q$predicted.id, "none")
Idents(seu.q) <- 'predicted.id'
seu.q$fb.sub.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'fb.sub.thresh', label = TRUE)
table(seu.q$fb.sub.thresh)

t.lables <- as.data.frame(table(seu.q$RNA_snn_res.0.2, seu.q$fb.sub.thresh))
t.lables$Freq <- as.double(t.lables$Freq)
ggplot(t.lables, aes(y = Freq, x = Var1, fill = Var2)) + geom_bar(position = "stack", stat= "identity") + RotatedAxis() 
top.pred.celltype <- as.data.frame(t.lables  %>% group_by(Var1)  %>% top_n(4, Freq))
df.top <- top.pred.celltype[order(top.pred.celltype$Var1,-top.pred.celltype$Freq),]
row.names(df.top) <- NULL
df.top$I <- row.names(df.top)

df.top.ft <- df.top %>% filter(Freq > 0)

```

Predictions with thresholds:

0 - RG
1 - RG
2 - none
3 - RG
4 - Neural precursor
5 - VLMC (vascular and leptomeninges)

Look at gene lists with known markers

```{r}

Idents(seu.q) <- 'RNA_snn_res.0.2'

# many cell types list
feature_list = c("MKI67","SOX2","POU5F1","DLX2","PAX6","SOX9","HES1","NES","RBFOX3","MAP2","NCAM1","CD24","GRIA2","GRIN2B","GABBR1","GAD1","GAD2","GABRA1","GABRB2","TH","ALDH1A1","LMX1B","NR4A2","CORIN","CALB1","KCNJ6","CXCR4","ITGA6","SLC1A3","CD44","AQP4","S100B", "PDGFRA","OLIG2","MBP","CLDN11","VIM","VCAM1")

DoHeatmap(seu.q, features = feature_list, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = feature_list) +RotatedAxis()

# Dopaminergic markers
PD_poulin = c("TH","SLC6A3","SLC18A2","SOX6","NDNF","SNCG","ALDH1A1","CALB1","TACR2","SLC17A6","SLC32A1","OTX2","GRP","LPL","CCK","VIP")

DoHeatmap(seu.q, features = PD_poulin, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = PD_poulin)+RotatedAxis()

ealryNeur = c("DCX","NEUROD1","TBR1")
proliferation = c("PCNA","MKI67")
neuralstem = c("SOX2","NES","PAX6","MASH1")

feature_list <- c("DCX","NEUROD1","TBR1","PCNA","MKI67","SOX2","NES","PAX6","MASH1")
DoHeatmap(seu.q, features = feature_list, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = feature_list)+RotatedAxis()


mat_neuron = c("RBFOX3","SYP","DLG45","VAMP1","VAMP2","TUBB3","SYT1","BSN","HOMER1","SLC17A6") 
# NeuN is FOX3 - RBFOX3
# PSD95 also SP-90 or DLG4
# VGLUT2 is SLC17A6
DoHeatmap(seu.q, features = mat_neuron, size=3, angle =90, group.bar.height = 0.02)
# cluster 4 also show mature neuron markers
DotPlot(seu.q, features = mat_neuron)+RotatedAxis()
# excitatory neuron markers
ex = c("GRIA2","GRIA1","GRIA4","GRIN1","GRIN2B","GRIN2A","GRIN3A","GRIN3","GRIP1","CAMK2A")
DoHeatmap(seu.q, features = ex, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = ex)+RotatedAxis()
# inhibitory neuron markers
inh = c("GAD1","GAD2", "GAT1","PVALB","GABR2","GABR1","GBRR1","GABRB2","GABRB1","GABRB3","GABRA6","GABRA1","GABRA4","TRAK2")
DoHeatmap(seu.q, features = inh, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = inh)+RotatedAxis()
# cluster 4 is more excitatory than inhbitory but neither marker set has much expression 

### glia markers
microglia = c("PTPRC","AIF1","ADGRE1")  # ADGRE1 is a microglia marker F4/80, CD45 is PTPRC, gene name IBA1 is AIF1
astolgNPCpromicro = c("GFAP","S100B","SLC1A2","MBP","SOX10","SPP1","DCX","NEUROD1","TBR1","PCNA","MKI67","PTPRC","AIF1","ADGRE1")
# note GLT1 is EAAT2 which is SLC1A2 glutatmate transporter
# epithelial
epi = c("HES1","HES5","SOX2","SOX10","NES","CDH1","NOTCH1") # e-cadherin is CDH1

DoHeatmap(seu.q, features = astolgNPCpromicro, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = astolgNPCpromicro)+RotatedAxis()
# cluster 4 is more excitatory than inhbitory but neither marker set has much expression 
DoHeatmap(seu.q, features = epi, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = epi)+RotatedAxis()

# also add Radial glia marker overlap with Glia and Neurons

features <- c("PTPRC","AIF1","ADGRE1", "VIM", "TNC","PTPRZ1","FAM107A","HOPX","LIFR",
              "ITGB5","IL6ST")
DoHeatmap(seu.q, features = features, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = features)+RotatedAxis()

# radial glia markers
rg <- c("VIM","NES","PAX6","HES1","EAAT1","NCAD1","SOX2","FABP7")
DoHeatmap(seu.q, features = rg, size=3, angle =90, group.bar.height = 0.02)
DotPlot(seu.q, features = rg)+RotatedAxis()

# NPC and radial glia are very similar

```
Marker expression predictions
Cluster 0 - unknown
Cluster 1 - RG
Cluster 2 - unknown
Cluster 3 - RG
cluster 4 - immature neurons
Cluster 5 - RG, opc


Check the levels of RNA in each cluster 

```{r}
VlnPlot(seu.q, features = "nFeature_RNA")

```
Cluster 0 and 2 have fewer sequences than other groups and thus no markers
Possibly remove these is they don't come up with some markers

Find cluster markers

```{r}
Idents(seu.q) <- 'RNA_snn_res.0.2'
ClusterMarkers <- FindAllMarkers(seu.q, only.pos = TRUE)

top5 <- ClusterMarkers %>% group_by(cluster) %>% top_n(n=5, wt = avg_log2FC)
DoHeatmap(seu.q, features = top5$gene, size=3, angle =90, group.bar.height = 0.02)

#write.csv(ClusterMarkers,"/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/Glia2RGClusterMarkers_new.csv")

Idents(seu.q) <- 'RNA_snn_res.0.1'
ClusterMarkers <- FindAllMarkers(seu.q, only.pos = TRUE)

top5 <- ClusterMarkers %>% group_by(cluster) %>% top_n(n=5, wt = avg_log2FC)
DoHeatmap(seu.q, features = top5$gene, size=3, angle =90, group.bar.height = 0.02)


```
Markers of 2 are matching with 5 possibly merge these together
Cluster 0 markers don't look up regulated but the list is long

Look at the libraries

```{r}
library(enrichR)

setEnrichrSite("Enrichr") # Human genes
# list of all the databases

# libaries with cell types

db <- c('Descartes_Cell_Types_and_Tissue_2021',
        'CellMarker_Augmented_2021','Azimuth_Cell_Types_2021')

# enrichr(genes, databases = NULL)

#I'll run the clusters one at a time

N1.c0 <- ClusterMarkers %>% filter(cluster == 0 & avg_log2FC > 0)
genes <- N1.c0$gene

N1.c0.Er <- enrichr(genes, databases = db)
plotEnrich(N1.c0.Er[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c0.Er[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")
plotEnrich(N1.c0.Er[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value")


N1.Er.genes.1 <- N1.c0.Er[[1]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.1

N1.Er.genes.2 <- N1.c0.Er[[2]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.2

N1.Er.genes.3 <- N1.c0.Er[[3]] %>% select(Term, Genes, Combined.Score)
N1.Er.genes.3






```

The adult brain doesn't predict radial glia - there are radial glia but I believe these are different from the 'neuroblast' type radial glia

I will use a developing brain reference

```{r}
DimPlot(seu.q)

```



```{r}

seu.q <- readRDS(paste(pathway,"Glia2LabledSeu03102022.RDS",sep = ""))

# midbrain and striatum

seu.r <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PublicData/Bhaduri_wholeBrain/Bhaduri_midbrain_striatum.RDS")

Idents(seu.r) <- "cell_cluster"

# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_cluster)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
seu.q$Bha.mid.stri.pred <- Idents(seu.q)
print(table(seu.q$Bha.mid.stri.pred))


seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, "none")
Idents(seu.q) <- 'predicted.id'
seu.q$Bha.mid.pred.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'Bha.mid.pred.thresh', label = TRUE)
table(seu.q$Bha.mid.pred.thresh)
DimPlot(seu.q, group.by = 'Bha.mid.pred.thresh')


# read in the reference dataset
# whole brain Bhaduri down sampled
seu.r <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/PublicData/Bhaduri_wholeBrain/Bhaduri_downsample.RDS")

Idents(seu.r) <- "cell_cluster"

# find the reference anchors
anchors <- FindTransferAnchors(reference = seu.r, query = seu.q, dims = 1:25)
print("getting predictions")
predictions <- TransferData(anchorset = anchors, refdata = seu.r$cell_cluster)
seu.q <- AddMetaData(seu.q, metadata = predictions)
print(table(seu.q$predicted.id))

Idents(seu.q) <- 'predicted.id'
seu.q$Bha.pred <- Idents(seu.q)
print(table(seu.q$Bha.pred))
DimPlot(seu.q, group.by = 'Bha.pred')
DimPlot(seu.q, group.by = 'subgroups')

seu.q$predicted.id <- ifelse(seu.q$prediction.score.max > 0.95, seu.q$predicted.id, "none")
Idents(seu.q) <- 'predicted.id'
seu.q$Bha.mid.pred.thresh <- Idents(seu.q)
DimPlot(seu.q, group.by = 'Bha.pred.thresh', label = TRUE)
table(seu.q$Bha.mid.pred.thresh)
DimPlot(seu.q, group.by = 'Bha.pred.thresh')


```





Add some cell type annotations

```{r}

Idents(seu.q) <- 'RNA_snn_res.0.2'

cluster.ids <- c("Glia1","RG1","Glia2","RG2","NeuronsImmature","RG3")

names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$subgroups <- Idents(seu.q)

#DimPlot(seu.q, group.by = 'RNA_snn_res.0.2', label = TRUE)
DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'subgroups', repel = TRUE)



```

```{r}
# save file
saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia2LabledSeu03102022.RDS")

```

Main cell groups

```{r}

Idents(seu.q) <- 'RNA_snn_res.0.2'

cluster.ids <- c("Radial Glia","Radial Glia","Radial Glia","Radial Glia","NPC","Other")

names(cluster.ids) <- levels(seu.q)
seu.q <- RenameIdents(seu.q, cluster.ids)
seu.q$Cell_Types <- Idents(seu.q)

#DimPlot(seu.q, group.by = 'RNA_snn_res.0.2', label = TRUE)
DimPlot(seu.q, reduction = "umap", label = TRUE, group.by = 'Cell_Types', repel = TRUE)

saveRDS(seu.q, "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/Glia2LabledSeu03102022.RDS")

```

Proportions of cell types

```{r}

table(seu.q$Cell_Types)
dim(seu.q)

prp <- as.data.frame(table(seu.q$Cell_Types))
prp

prp$prop <- prp$Freq/sum(prp$Freq)*100
prp$Sample <- 'RadialGlia'
prp


```

I'll calculate the proportions for each cell type and make a table or plot in the comparison workbook. 



